{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:M3RVYXKXQDDZVHKCOXA6IXD67C","short_pith_number":"pith:M3RVYXKX","canonical_record":{"source":{"id":"2605.04128","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.GR","submitted_at":"2026-05-05T15:49:47Z","cross_cats_sorted":["cs.AI","cs.CL","cs.CV","cs.LG"],"title_canon_sha256":"c757212ec4e5bd0c5344229b9b2d2eafbed426984c9e69a73a779487d034d3e5","abstract_canon_sha256":"fba3f1cab1d8a7e0787b38adf5394d3a25b98caa8c5a44ce57858e90ff64f3c0"},"schema_version":"1.0"},"canonical_sha256":"66e35c5d5780c79a9d4275c1e45c7ef8910a6bcc5f0287ae2ba4456f9ec32b41","source":{"kind":"arxiv","id":"2605.04128","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.04128","created_at":"2026-05-21T01:05:20Z"},{"alias_kind":"arxiv_version","alias_value":"2605.04128v2","created_at":"2026-05-21T01:05:20Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.04128","created_at":"2026-05-21T01:05:20Z"},{"alias_kind":"pith_short_12","alias_value":"M3RVYXKXQDDZ","created_at":"2026-05-21T01:05:20Z"},{"alias_kind":"pith_short_16","alias_value":"M3RVYXKXQDDZVHKC","created_at":"2026-05-21T01:05:20Z"},{"alias_kind":"pith_short_8","alias_value":"M3RVYXKX","created_at":"2026-05-21T01:05:20Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:M3RVYXKXQDDZVHKCOXA6IXD67C","target":"record","payload":{"canonical_record":{"source":{"id":"2605.04128","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.GR","submitted_at":"2026-05-05T15:49:47Z","cross_cats_sorted":["cs.AI","cs.CL","cs.CV","cs.LG"],"title_canon_sha256":"c757212ec4e5bd0c5344229b9b2d2eafbed426984c9e69a73a779487d034d3e5","abstract_canon_sha256":"fba3f1cab1d8a7e0787b38adf5394d3a25b98caa8c5a44ce57858e90ff64f3c0"},"schema_version":"1.0"},"canonical_sha256":"66e35c5d5780c79a9d4275c1e45c7ef8910a6bcc5f0287ae2ba4456f9ec32b41","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-21T01:05:20.118811Z","signature_b64":"6lRTS3aDdSmIsFs6cD6S8mOvaXLDQHVRUtxIhXKj+mrjiNQffcF5v6c2eSLX3IMwtgIbyMDvEAQGz/+HsQo0Ag==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"66e35c5d5780c79a9d4275c1e45c7ef8910a6bcc5f0287ae2ba4456f9ec32b41","last_reissued_at":"2026-05-21T01:05:20.118154Z","signature_status":"signed_v1","first_computed_at":"2026-05-21T01:05:20.118154Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2605.04128","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-21T01:05:20Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"lirb7rt1/rHVLu8nMmUzta89SknIV1bnq2bJkXdz2opZcelCS7gQARYLk4NGyo/fsVUwS/OHPyhENJrdMKHpCQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-03T08:23:11.274729Z"},"content_sha256":"1cd242e2e3dd142ec3bdbc8420163a60c72c23481264b7f476df4ede5a9b088c","schema_version":"1.0","event_id":"sha256:1cd242e2e3dd142ec3bdbc8420163a60c72c23481264b7f476df4ede5a9b088c"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:M3RVYXKXQDDZVHKCOXA6IXD67C","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"JoyAI-Image: Awaking Spatial Intelligence in Unified Multimodal Understanding and Generation","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"A bidirectional loop between understanding, spatial editing, and novel-view reasoning strengthens spatial intelligence beyond general visual tasks.","cross_cats":["cs.AI","cs.CL","cs.CV","cs.LG"],"primary_cat":"cs.GR","authors_text":"Bo Wang, Guohui Zhang, Guoqing Ma, Hang Xu, Haoyang Huang, Haoze Sun, Jianhui Liu, Lin Song, Maoquan Zhang, Nan Duan, Nan Jiang, Wei Tang, Wenbo Li, Wenhu Zhang, Xin Han, Yanbing Zhang, Yicheng Xiao, Yijun Yang, Yuan Zhang","submitted_at":"2026-05-05T15:49:47Z","abstract_excerpt":"We present JoyAI-Image, a unified multimodal foundation model for visual understanding, text-to-image generation, and instruction-guided image editing. JoyAI-Image couples a spatially enhanced Multimodal Large Language Model (MLLM) with a Multimodal Diffusion Transformer (MMDiT), allowing perception and generation to interact through a shared multimodal interface. Around this architecture, we build a scalable training recipe that combines unified instruction tuning, long-text rendering supervision, spatially grounded data, and both general and spatial editing signals. This design gives the mod"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"the bidirectional loop between enhanced understanding, controllable spatial editing, and novel-view-assisted reasoning enables the model to move beyond general visual competence toward stronger spatial intelligence.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the described architecture and training recipe (unified instruction tuning, spatially grounded data, and editing signals) actually produce measurable gains in spatial intelligence beyond what separate models achieve, as no specific metrics, baselines, or ablation results are provided in the abstract.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"JoyAI-Image unifies visual understanding, generation, and editing in one model and claims stronger spatial intelligence through bidirectional perception-generation loops.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"A bidirectional loop between understanding, spatial editing, and novel-view reasoning strengthens spatial intelligence beyond general visual tasks.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"e3c28712c06f1482531cba41db65c0931cc1a44084aad8be7da82fe1c2cf9d29"},"source":{"id":"2605.04128","kind":"arxiv","version":2},"verdict":{"id":"e9eb65f5-acc6-4857-b537-b03f032b9eb2","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-08T17:51:51.853066Z","strongest_claim":"the bidirectional loop between enhanced understanding, controllable spatial editing, and novel-view-assisted reasoning enables the model to move beyond general visual competence toward stronger spatial intelligence.","one_line_summary":"JoyAI-Image unifies visual understanding, generation, and editing in one model and claims stronger spatial intelligence through bidirectional perception-generation loops.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the described architecture and training recipe (unified instruction tuning, spatially grounded data, and editing signals) actually produce measurable gains in spatial intelligence beyond what separate models achieve, as no specific metrics, baselines, or ablation results are provided in the abstract.","pith_extraction_headline":"A bidirectional loop between understanding, spatial editing, and novel-view reasoning strengthens spatial intelligence beyond general visual tasks."},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.04128/integrity.json","findings":[],"available":true,"detectors_run":[{"name":"ai_meta_artifact","ran_at":"2026-05-20T12:41:25.614049Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"doi_title_agreement","ran_at":"2026-05-20T00:01:21.662496Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"doi_compliance","ran_at":"2026-05-19T14:57:42.646567Z","status":"completed","version":"1.0.0","findings_count":0}],"snapshot_sha256":"f5391a8ac1679374e8132fd9029044260e8c72265c11fa36bd4c686febb1773c"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":3,"snapshot_sha256":"ca4217b742c0d72e4347b17192ac55e9077f20c8573e1b54746014cf4ff0b37a"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"e9eb65f5-acc6-4857-b537-b03f032b9eb2"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-21T01:05:20Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"qNstDpZMXNSCl5Q79aBORO9YXQC51AoTTknd19MVOz76PttR/UhRqJisuymYQKtKpu3IKNwRyNZ34E2pNCT3CA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-03T08:23:11.275225Z"},"content_sha256":"a5989e40c5cf560b5d626090b7cb41a6ac0b568491626f4cf960ecc40d6a9791","schema_version":"1.0","event_id":"sha256:a5989e40c5cf560b5d626090b7cb41a6ac0b568491626f4cf960ecc40d6a9791"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/M3RVYXKXQDDZVHKCOXA6IXD67C/bundle.json","state_url":"https://pith.science/pith/M3RVYXKXQDDZVHKCOXA6IXD67C/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/M3RVYXKXQDDZVHKCOXA6IXD67C/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-03T08:23:11Z","links":{"resolver":"https://pith.science/pith/M3RVYXKXQDDZVHKCOXA6IXD67C","bundle":"https://pith.science/pith/M3RVYXKXQDDZVHKCOXA6IXD67C/bundle.json","state":"https://pith.science/pith/M3RVYXKXQDDZVHKCOXA6IXD67C/state.json","well_known_bundle":"https://pith.science/.well-known/pith/M3RVYXKXQDDZVHKCOXA6IXD67C/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:M3RVYXKXQDDZVHKCOXA6IXD67C","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"fba3f1cab1d8a7e0787b38adf5394d3a25b98caa8c5a44ce57858e90ff64f3c0","cross_cats_sorted":["cs.AI","cs.CL","cs.CV","cs.LG"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.GR","submitted_at":"2026-05-05T15:49:47Z","title_canon_sha256":"c757212ec4e5bd0c5344229b9b2d2eafbed426984c9e69a73a779487d034d3e5"},"schema_version":"1.0","source":{"id":"2605.04128","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.04128","created_at":"2026-05-21T01:05:20Z"},{"alias_kind":"arxiv_version","alias_value":"2605.04128v2","created_at":"2026-05-21T01:05:20Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.04128","created_at":"2026-05-21T01:05:20Z"},{"alias_kind":"pith_short_12","alias_value":"M3RVYXKXQDDZ","created_at":"2026-05-21T01:05:20Z"},{"alias_kind":"pith_short_16","alias_value":"M3RVYXKXQDDZVHKC","created_at":"2026-05-21T01:05:20Z"},{"alias_kind":"pith_short_8","alias_value":"M3RVYXKX","created_at":"2026-05-21T01:05:20Z"}],"graph_snapshots":[{"event_id":"sha256:a5989e40c5cf560b5d626090b7cb41a6ac0b568491626f4cf960ecc40d6a9791","target":"graph","created_at":"2026-05-21T01:05:20Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"the bidirectional loop between enhanced understanding, controllable spatial editing, and novel-view-assisted reasoning enables the model to move beyond general visual competence toward stronger spatial intelligence."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the described architecture and training recipe (unified instruction tuning, spatially grounded data, and editing signals) actually produce measurable gains in spatial intelligence beyond what separate models achieve, as no specific metrics, baselines, or ablation results are provided in the abstract."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"JoyAI-Image unifies visual understanding, generation, and editing in one model and claims stronger spatial intelligence through bidirectional perception-generation loops."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"A bidirectional loop between understanding, spatial editing, and novel-view reasoning strengthens spatial intelligence beyond general visual tasks."}],"snapshot_sha256":"e3c28712c06f1482531cba41db65c0931cc1a44084aad8be7da82fe1c2cf9d29"},"formal_canon":{"evidence_count":3,"snapshot_sha256":"ca4217b742c0d72e4347b17192ac55e9077f20c8573e1b54746014cf4ff0b37a"},"integrity":{"available":true,"clean":true,"detectors_run":[{"findings_count":0,"name":"ai_meta_artifact","ran_at":"2026-05-20T12:41:25.614049Z","status":"completed","version":"1.0.0"},{"findings_count":0,"name":"doi_title_agreement","ran_at":"2026-05-20T00:01:21.662496Z","status":"completed","version":"1.0.0"},{"findings_count":0,"name":"doi_compliance","ran_at":"2026-05-19T14:57:42.646567Z","status":"completed","version":"1.0.0"}],"endpoint":"/pith/2605.04128/integrity.json","findings":[],"snapshot_sha256":"f5391a8ac1679374e8132fd9029044260e8c72265c11fa36bd4c686febb1773c","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"We present JoyAI-Image, a unified multimodal foundation model for visual understanding, text-to-image generation, and instruction-guided image editing. JoyAI-Image couples a spatially enhanced Multimodal Large Language Model (MLLM) with a Multimodal Diffusion Transformer (MMDiT), allowing perception and generation to interact through a shared multimodal interface. Around this architecture, we build a scalable training recipe that combines unified instruction tuning, long-text rendering supervision, spatially grounded data, and both general and spatial editing signals. This design gives the mod","authors_text":"Bo Wang, Guohui Zhang, Guoqing Ma, Hang Xu, Haoyang Huang, Haoze Sun, Jianhui Liu, Lin Song, Maoquan Zhang, Nan Duan, Nan Jiang, Wei Tang, Wenbo Li, Wenhu Zhang, Xin Han, Yanbing Zhang, Yicheng Xiao, Yijun Yang, Yuan Zhang","cross_cats":["cs.AI","cs.CL","cs.CV","cs.LG"],"headline":"A bidirectional loop between understanding, spatial editing, and novel-view reasoning strengthens spatial intelligence beyond general visual tasks.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.GR","submitted_at":"2026-05-05T15:49:47Z","title":"JoyAI-Image: Awaking Spatial Intelligence in Unified Multimodal Understanding and Generation"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.04128","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-08T17:51:51.853066Z","id":"e9eb65f5-acc6-4857-b537-b03f032b9eb2","model_set":{"reader":"grok-4.3"},"one_line_summary":"JoyAI-Image unifies visual understanding, generation, and editing in one model and claims stronger spatial intelligence through bidirectional perception-generation loops.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"A bidirectional loop between understanding, spatial editing, and novel-view reasoning strengthens spatial intelligence beyond general visual tasks.","strongest_claim":"the bidirectional loop between enhanced understanding, controllable spatial editing, and novel-view-assisted reasoning enables the model to move beyond general visual competence toward stronger spatial intelligence.","weakest_assumption":"That the described architecture and training recipe (unified instruction tuning, spatially grounded data, and editing signals) actually produce measurable gains in spatial intelligence beyond what separate models achieve, as no specific metrics, baselines, or ablation results are provided in the abstract."}},"verdict_id":"e9eb65f5-acc6-4857-b537-b03f032b9eb2"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:1cd242e2e3dd142ec3bdbc8420163a60c72c23481264b7f476df4ede5a9b088c","target":"record","created_at":"2026-05-21T01:05:20Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"fba3f1cab1d8a7e0787b38adf5394d3a25b98caa8c5a44ce57858e90ff64f3c0","cross_cats_sorted":["cs.AI","cs.CL","cs.CV","cs.LG"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.GR","submitted_at":"2026-05-05T15:49:47Z","title_canon_sha256":"c757212ec4e5bd0c5344229b9b2d2eafbed426984c9e69a73a779487d034d3e5"},"schema_version":"1.0","source":{"id":"2605.04128","kind":"arxiv","version":2}},"canonical_sha256":"66e35c5d5780c79a9d4275c1e45c7ef8910a6bcc5f0287ae2ba4456f9ec32b41","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"66e35c5d5780c79a9d4275c1e45c7ef8910a6bcc5f0287ae2ba4456f9ec32b41","first_computed_at":"2026-05-21T01:05:20.118154Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-21T01:05:20.118154Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"6lRTS3aDdSmIsFs6cD6S8mOvaXLDQHVRUtxIhXKj+mrjiNQffcF5v6c2eSLX3IMwtgIbyMDvEAQGz/+HsQo0Ag==","signature_status":"signed_v1","signed_at":"2026-05-21T01:05:20.118811Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.04128","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:1cd242e2e3dd142ec3bdbc8420163a60c72c23481264b7f476df4ede5a9b088c","sha256:a5989e40c5cf560b5d626090b7cb41a6ac0b568491626f4cf960ecc40d6a9791"],"state_sha256":"be7ec4ef6d585d0395e1244d697c89fdf2daf76e20138d5808b864a8b78f3f7f"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"vt4I9RMLK6jS++WbunSMmAT1HWKDagZvVDRjWGMpoaJGj1zgiH7RiiLEDyMHLmIBSXFUWszSTrvAwxLBl4I5Dg==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-03T08:23:11.277588Z","bundle_sha256":"2321043ef1d3a5edd32014625b4f974d2bd3097bc4bd4f1b938e0eb29ab80f2c"}}