{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2024:4Q5XUAZ334OCLVGJRTDYNWGGXY","short_pith_number":"pith:4Q5XUAZ3","canonical_record":{"source":{"id":"2410.13848","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2024-10-17T17:58:37Z","cross_cats_sorted":["cs.AI","cs.CL"],"title_canon_sha256":"5fdd467fda2d0c8f5f6d4b978c5f95603eb588043dd172c2860ea2b4301e3512","abstract_canon_sha256":"ccac9affd22a03abd24c36a6f1dbfd031a9ff1e4871892b274524fd558580680"},"schema_version":"1.0"},"canonical_sha256":"e43b7a033bdf1c25d4c98cc786d8c6be3f0a249cee13c37b6adeec1632aa6689","source":{"kind":"arxiv","id":"2410.13848","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2410.13848","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"arxiv_version","alias_value":"2410.13848v1","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2410.13848","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"pith_short_12","alias_value":"4Q5XUAZ334OC","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"4Q5XUAZ334OCLVGJ","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"4Q5XUAZ3","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2024:4Q5XUAZ334OCLVGJRTDYNWGGXY","target":"record","payload":{"canonical_record":{"source":{"id":"2410.13848","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2024-10-17T17:58:37Z","cross_cats_sorted":["cs.AI","cs.CL"],"title_canon_sha256":"5fdd467fda2d0c8f5f6d4b978c5f95603eb588043dd172c2860ea2b4301e3512","abstract_canon_sha256":"ccac9affd22a03abd24c36a6f1dbfd031a9ff1e4871892b274524fd558580680"},"schema_version":"1.0"},"canonical_sha256":"e43b7a033bdf1c25d4c98cc786d8c6be3f0a249cee13c37b6adeec1632aa6689","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:49.992425Z","signature_b64":"DBZHmxwTahZ5s6Lem4K5IbEgQP+Yj2BynNUvm5kXkKENtAY+juRJ0QSG7jzqEeKtvGhvguoXKyqgZyLeLKGKBQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"e43b7a033bdf1c25d4c98cc786d8c6be3f0a249cee13c37b6adeec1632aa6689","last_reissued_at":"2026-05-17T23:38:49.991947Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:49.991947Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2410.13848","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:49Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"nKK/bxP8iHPHn1q9V54UCaG2Yfdh7pxE0+LneT6hibLO9G3uaV3N28iwTq5lu6EJc4rOWoxjU56Ybna37ELJCA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-28T10:02:34.297482Z"},"content_sha256":"3522ea6adbe059da539b6bcb205748aaeee6de695bec0aa174e5afa1bf4d4393","schema_version":"1.0","event_id":"sha256:3522ea6adbe059da539b6bcb205748aaeee6de695bec0aa174e5afa1bf4d4393"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2024:4Q5XUAZ334OCLVGJRTDYNWGGXY","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Janus: Decoupling Visual Encoding for Unified Multimodal Understanding and Generation","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Decoupling the visual encoder into separate pathways lets a single transformer handle both multimodal understanding and generation without performance trade-offs.","cross_cats":["cs.AI","cs.CL"],"primary_cat":"cs.CV","authors_text":"Chengyue Wu, Chong Ruan, Ping Luo, Wen Liu, Xiaokang Chen, Xingchao Liu, Xingkai Yu, Yiyang Ma, Zhenda Xie, Zhiyu Wu, Zizheng Pan","submitted_at":"2024-10-17T17:58:37Z","abstract_excerpt":"In this paper, we introduce Janus, an autoregressive framework that unifies multimodal understanding and generation. Prior research often relies on a single visual encoder for both tasks, such as Chameleon. However, due to the differing levels of information granularity required by multimodal understanding and generation, this approach can lead to suboptimal performance, particularly in multimodal understanding. To address this issue, we decouple visual encoding into separate pathways, while still leveraging a single, unified transformer architecture for processing. The decoupling not only all"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Experiments show that Janus surpasses previous unified model and matches or exceeds the performance of task-specific models.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the conflict arising from differing information granularity in understanding versus generation is the main performance bottleneck and that decoupling the encoders will resolve it without introducing new integration problems in the shared transformer.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Janus decouples visual encoding into task-specific pathways inside a single autoregressive transformer to unify multimodal understanding and generation while outperforming earlier unified models.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Decoupling the visual encoder into separate pathways lets a single transformer handle both multimodal understanding and generation without performance trade-offs.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"0a66796c58d4c14dc82e2dcafb1bb013f32a08214434d4518f56713452f97e99"},"source":{"id":"2410.13848","kind":"arxiv","version":1},"verdict":{"id":"b9051b6c-71f6-4519-adc3-0409a76051bf","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T22:06:37.001843Z","strongest_claim":"Experiments show that Janus surpasses previous unified model and matches or exceeds the performance of task-specific models.","one_line_summary":"Janus decouples visual encoding into task-specific pathways inside a single autoregressive transformer to unify multimodal understanding and generation while outperforming earlier unified models.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the conflict arising from differing information granularity in understanding versus generation is the main performance bottleneck and that decoupling the encoders will resolve it without introducing new integration problems in the shared transformer.","pith_extraction_headline":"Decoupling the visual encoder into separate pathways lets a single transformer handle both multimodal understanding and generation without performance trade-offs."},"references":{"count":96,"sample":[{"doi":"","year":2023,"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","ref_index":1,"cited_arxiv_id":"2303.08774","is_internal_anchor":true},{"doi":"","year":2024,"title":"The claude 3 model family: Opus, sonnet, haiku","work_id":"bf65b1ed-fd04-46c8-820a-913e0eacf201","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"Qwen-VL: A Versatile Vision-Language Model for Understanding, Localization, Text Reading, and Beyond","work_id":"cbc2bb21-b6bb-46c0-80bf-107e195ffe10","ref_index":3,"cited_arxiv_id":"2308.12966","is_internal_anchor":true},{"doi":"","year":2023,"title":"arXiv preprint arXiv:2306.16934 (2023)","work_id":"3754e7d4-269d-41a6-b9f3-7c94d2e69e99","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"DeepSeek LLM: Scaling Open-Source Language Models with Longtermism","work_id":"01b10587-025b-499d-8ba3-7a538d24c2d6","ref_index":5,"cited_arxiv_id":"2401.02954","is_internal_anchor":true}],"resolved_work":96,"snapshot_sha256":"f57c3c4a6cfc64e099b1dde944eea67b9c76ea714a5b1b47a689cf71314fb373","internal_anchors":38},"formal_canon":{"evidence_count":3,"snapshot_sha256":"eebe8bb6914d2bce0a691e3d2b4d26c46d98efec80ca32906d4c7efc081bd46a"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"b9051b6c-71f6-4519-adc3-0409a76051bf"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:49Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"0emSbH2Pb2jbTR5Rqqj+jbJxKNttEHOwJk5Plf+fOXdM+5XepmQ2aEy9NJQm05b736UkEYhxi2BYQc7G7ePYCA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-28T10:02:34.298010Z"},"content_sha256":"e04a3b0cb4103915b3e4bddab419c94b74c129b204700d3ebdc4025918c68c9d","schema_version":"1.0","event_id":"sha256:e04a3b0cb4103915b3e4bddab419c94b74c129b204700d3ebdc4025918c68c9d"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/4Q5XUAZ334OCLVGJRTDYNWGGXY/bundle.json","state_url":"https://pith.science/pith/4Q5XUAZ334OCLVGJRTDYNWGGXY/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/4Q5XUAZ334OCLVGJRTDYNWGGXY/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-28T10:02:34Z","links":{"resolver":"https://pith.science/pith/4Q5XUAZ334OCLVGJRTDYNWGGXY","bundle":"https://pith.science/pith/4Q5XUAZ334OCLVGJRTDYNWGGXY/bundle.json","state":"https://pith.science/pith/4Q5XUAZ334OCLVGJRTDYNWGGXY/state.json","well_known_bundle":"https://pith.science/.well-known/pith/4Q5XUAZ334OCLVGJRTDYNWGGXY/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2024:4Q5XUAZ334OCLVGJRTDYNWGGXY","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"ccac9affd22a03abd24c36a6f1dbfd031a9ff1e4871892b274524fd558580680","cross_cats_sorted":["cs.AI","cs.CL"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2024-10-17T17:58:37Z","title_canon_sha256":"5fdd467fda2d0c8f5f6d4b978c5f95603eb588043dd172c2860ea2b4301e3512"},"schema_version":"1.0","source":{"id":"2410.13848","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2410.13848","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"arxiv_version","alias_value":"2410.13848v1","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2410.13848","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"pith_short_12","alias_value":"4Q5XUAZ334OC","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"4Q5XUAZ334OCLVGJ","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"4Q5XUAZ3","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:e04a3b0cb4103915b3e4bddab419c94b74c129b204700d3ebdc4025918c68c9d","target":"graph","created_at":"2026-05-17T23:38:49Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Experiments show that Janus surpasses previous unified model and matches or exceeds the performance of task-specific models."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the conflict arising from differing information granularity in understanding versus generation is the main performance bottleneck and that decoupling the encoders will resolve it without introducing new integration problems in the shared transformer."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Janus decouples visual encoding into task-specific pathways inside a single autoregressive transformer to unify multimodal understanding and generation while outperforming earlier unified models."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Decoupling the visual encoder into separate pathways lets a single transformer handle both multimodal understanding and generation without performance trade-offs."}],"snapshot_sha256":"0a66796c58d4c14dc82e2dcafb1bb013f32a08214434d4518f56713452f97e99"},"formal_canon":{"evidence_count":3,"snapshot_sha256":"eebe8bb6914d2bce0a691e3d2b4d26c46d98efec80ca32906d4c7efc081bd46a"},"paper":{"abstract_excerpt":"In this paper, we introduce Janus, an autoregressive framework that unifies multimodal understanding and generation. Prior research often relies on a single visual encoder for both tasks, such as Chameleon. However, due to the differing levels of information granularity required by multimodal understanding and generation, this approach can lead to suboptimal performance, particularly in multimodal understanding. To address this issue, we decouple visual encoding into separate pathways, while still leveraging a single, unified transformer architecture for processing. The decoupling not only all","authors_text":"Chengyue Wu, Chong Ruan, Ping Luo, Wen Liu, Xiaokang Chen, Xingchao Liu, Xingkai Yu, Yiyang Ma, Zhenda Xie, Zhiyu Wu, Zizheng Pan","cross_cats":["cs.AI","cs.CL"],"headline":"Decoupling the visual encoder into separate pathways lets a single transformer handle both multimodal understanding and generation without performance trade-offs.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2024-10-17T17:58:37Z","title":"Janus: Decoupling Visual Encoding for Unified Multimodal Understanding and Generation"},"references":{"count":96,"internal_anchors":38,"resolved_work":96,"sample":[{"cited_arxiv_id":"2303.08774","doi":"","is_internal_anchor":true,"ref_index":1,"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"The claude 3 model family: Opus, sonnet, haiku","work_id":"bf65b1ed-fd04-46c8-820a-913e0eacf201","year":2024},{"cited_arxiv_id":"2308.12966","doi":"","is_internal_anchor":true,"ref_index":3,"title":"Qwen-VL: A Versatile Vision-Language Model for Understanding, Localization, Text Reading, and Beyond","work_id":"cbc2bb21-b6bb-46c0-80bf-107e195ffe10","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"arXiv preprint arXiv:2306.16934 (2023)","work_id":"3754e7d4-269d-41a6-b9f3-7c94d2e69e99","year":2023},{"cited_arxiv_id":"2401.02954","doi":"","is_internal_anchor":true,"ref_index":5,"title":"DeepSeek LLM: Scaling Open-Source Language Models with Longtermism","work_id":"01b10587-025b-499d-8ba3-7a538d24c2d6","year":2024}],"snapshot_sha256":"f57c3c4a6cfc64e099b1dde944eea67b9c76ea714a5b1b47a689cf71314fb373"},"source":{"id":"2410.13848","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-15T22:06:37.001843Z","id":"b9051b6c-71f6-4519-adc3-0409a76051bf","model_set":{"reader":"grok-4.3"},"one_line_summary":"Janus decouples visual encoding into task-specific pathways inside a single autoregressive transformer to unify multimodal understanding and generation while outperforming earlier unified models.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Decoupling the visual encoder into separate pathways lets a single transformer handle both multimodal understanding and generation without performance trade-offs.","strongest_claim":"Experiments show that Janus surpasses previous unified model and matches or exceeds the performance of task-specific models.","weakest_assumption":"That the conflict arising from differing information granularity in understanding versus generation is the main performance bottleneck and that decoupling the encoders will resolve it without introducing new integration problems in the shared transformer."}},"verdict_id":"b9051b6c-71f6-4519-adc3-0409a76051bf"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:3522ea6adbe059da539b6bcb205748aaeee6de695bec0aa174e5afa1bf4d4393","target":"record","created_at":"2026-05-17T23:38:49Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"ccac9affd22a03abd24c36a6f1dbfd031a9ff1e4871892b274524fd558580680","cross_cats_sorted":["cs.AI","cs.CL"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2024-10-17T17:58:37Z","title_canon_sha256":"5fdd467fda2d0c8f5f6d4b978c5f95603eb588043dd172c2860ea2b4301e3512"},"schema_version":"1.0","source":{"id":"2410.13848","kind":"arxiv","version":1}},"canonical_sha256":"e43b7a033bdf1c25d4c98cc786d8c6be3f0a249cee13c37b6adeec1632aa6689","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"e43b7a033bdf1c25d4c98cc786d8c6be3f0a249cee13c37b6adeec1632aa6689","first_computed_at":"2026-05-17T23:38:49.991947Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:49.991947Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"DBZHmxwTahZ5s6Lem4K5IbEgQP+Yj2BynNUvm5kXkKENtAY+juRJ0QSG7jzqEeKtvGhvguoXKyqgZyLeLKGKBQ==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:49.992425Z","signed_message":"canonical_sha256_bytes"},"source_id":"2410.13848","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:3522ea6adbe059da539b6bcb205748aaeee6de695bec0aa174e5afa1bf4d4393","sha256:e04a3b0cb4103915b3e4bddab419c94b74c129b204700d3ebdc4025918c68c9d"],"state_sha256":"c6cbfb6c6495b0a8c9742098a5f717bbc4616e8cd2ccc8a348aef4ce92c607bd"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"59Yd8lXyt3nXdbLA/IKiEA4w+gfUHVV2wv/Ze9J6IpS1D0Sszx0IOLnX85iT+sTw68P5iHzYT/JvKuKavbRgDQ==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-28T10:02:34.300456Z","bundle_sha256":"2cc26ee96a9eecc36b30ce8c94ae669ca8077ab1cb536fcb6cf80b7ecfaf9089"}}