{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:KKLSRPH4474HO5EJ6H5QDCT6FE","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"00db5d85e5165711255697223170b23f4b7cf47aeef3efc6ea48f8aed1397dd8","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2026-04-27T17:59:56Z","title_canon_sha256":"2b180adec08e74911f06561a9ed78579237c978d0e1d663a766bb5fb5141dba3"},"schema_version":"1.0","source":{"id":"2604.24764","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2604.24764","created_at":"2026-05-21T01:05:19Z"},{"alias_kind":"arxiv_version","alias_value":"2604.24764v2","created_at":"2026-05-21T01:05:19Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2604.24764","created_at":"2026-05-21T01:05:19Z"},{"alias_kind":"pith_short_12","alias_value":"KKLSRPH4474H","created_at":"2026-05-21T01:05:19Z"},{"alias_kind":"pith_short_16","alias_value":"KKLSRPH4474HO5EJ","created_at":"2026-05-21T01:05:19Z"},{"alias_kind":"pith_short_8","alias_value":"KKLSRPH4","created_at":"2026-05-21T01:05:19Z"}],"graph_snapshots":[{"event_id":"sha256:313334b06db948eaa9caf49844fdc9bb2c7374dfd53b8e191da17a15ec4010a5","target":"graph","created_at":"2026-05-21T01:05:19Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Utilizing Flow-GRPO, we optimize the model using feedback from pre-trained 3D foundation models and vision-language models to enforce structural coherence without altering the underlying architecture... Extensive evaluations reveal that our approach significantly enhances 3D consistency while preserving the original visual quality of the foundation model."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That feedback signals from pre-trained 3D foundation models and vision-language models provide reliable, unbiased measures of structural coherence that translate directly to improved video generation."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"World-R1 uses RL with 3D model feedback and a new text dataset to improve geometric consistency in text-to-video generation while keeping the base model unchanged."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Reinforcement learning with feedback from 3D models enforces geometric consistency in text-to-video generation without changing the base architecture."}],"snapshot_sha256":"89755c129c2fcece1877d6bb87db129efdf57356afb64fc41958573e27d91e5e"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[{"findings_count":0,"name":"doi_compliance","ran_at":"2026-05-19T21:44:27.559788Z","status":"completed","version":"1.0.0"}],"endpoint":"/pith/2604.24764/integrity.json","findings":[],"snapshot_sha256":"7ce7aa486fa568913c153c85895415147856a50453a51d84422791734fd2285b","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Recent video foundation models demonstrate impressive visual synthesis but frequently suffer from geometric inconsistencies. While existing methods attempt to inject 3D priors via architectural modifications, they often incur high computational costs and limit scalability. We propose World-R1, a framework that aligns video generation with 3D constraints through reinforcement learning. To facilitate this alignment, we introduce a specialized pure text dataset tailored for world simulation. Utilizing Flow-GRPO, we optimize the model using feedback from pre-trained 3D foundation models and vision","authors_text":"Bohan Zhuang, Donny Y. Chen, Weijie Wang, Xiaoxuan He, Xirui Hu, Yanbo Ding, Yefei He, Yifan Yang, Youping Gu, Yuqing Yang, Zeyu Zhang, Zhiyuan He","cross_cats":[],"headline":"Reinforcement learning with feedback from 3D models enforces geometric consistency in text-to-video generation without changing the base architecture.","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2026-04-27T17:59:56Z","title":"World-R1: Reinforcing 3D Constraints for Text-to-Video Generation"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2604.24764","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-08T04:23:12.305647Z","id":"4d52fb4a-b096-4ca0-b634-19609ea79f20","model_set":{"reader":"grok-4.3"},"one_line_summary":"World-R1 uses RL with 3D model feedback and a new text dataset to improve geometric consistency in text-to-video generation while keeping the base model unchanged.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Reinforcement learning with feedback from 3D models enforces geometric consistency in text-to-video generation without changing the base architecture.","strongest_claim":"Utilizing Flow-GRPO, we optimize the model using feedback from pre-trained 3D foundation models and vision-language models to enforce structural coherence without altering the underlying architecture... Extensive evaluations reveal that our approach significantly enhances 3D consistency while preserving the original visual quality of the foundation model.","weakest_assumption":"That feedback signals from pre-trained 3D foundation models and vision-language models provide reliable, unbiased measures of structural coherence that translate directly to improved video generation."}},"verdict_id":"4d52fb4a-b096-4ca0-b634-19609ea79f20"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:2c3b3233e6297eb63c40ecf0b6e6416809bd76d2e65b08b60316bb95218b63f8","target":"record","created_at":"2026-05-21T01:05:19Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"00db5d85e5165711255697223170b23f4b7cf47aeef3efc6ea48f8aed1397dd8","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2026-04-27T17:59:56Z","title_canon_sha256":"2b180adec08e74911f06561a9ed78579237c978d0e1d663a766bb5fb5141dba3"},"schema_version":"1.0","source":{"id":"2604.24764","kind":"arxiv","version":2}},"canonical_sha256":"529728bcfce7f8777489f1fb018a7e29199d2ccacdf51b57c12bef710d506bd1","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"529728bcfce7f8777489f1fb018a7e29199d2ccacdf51b57c12bef710d506bd1","first_computed_at":"2026-05-21T01:05:19.520643Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-21T01:05:19.520643Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"dCvZt1rbxiUOUl+GhLlKeZs0m/MP/PkhqEIjvEm94u7UAc/hCbdW2EzyQ3m19R4yi+3cybUp3MctfiEg93NOAg==","signature_status":"signed_v1","signed_at":"2026-05-21T01:05:19.521055Z","signed_message":"canonical_sha256_bytes"},"source_id":"2604.24764","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:2c3b3233e6297eb63c40ecf0b6e6416809bd76d2e65b08b60316bb95218b63f8","sha256:313334b06db948eaa9caf49844fdc9bb2c7374dfd53b8e191da17a15ec4010a5"],"state_sha256":"f81a2f68e57c3ffeffdc94cf246010cecc98ec293fde61cd6b1d2033199dbb67"}