{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:GHNBNG4UEWLWPVU2NRB2RO2EY3","merge_version":"pith-open-graph-merge-v1","event_count":4,"valid_event_count":4,"invalid_event_count":0,"equivocation_count":1,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"eef6e0529d268c02237d963884733f3ac70a5b621928b74e9f4c12eaa73e4cd1","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-14T17:58:03Z","title_canon_sha256":"ac7f02e2b426f77376b0a94085db9f39f21a7eac26efeb50d349b13165daa19a"},"schema_version":"1.0","source":{"id":"2605.15178","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.15178","created_at":"2026-05-17T21:18:32Z"},{"alias_kind":"arxiv_version","alias_value":"2605.15178v1","created_at":"2026-05-17T21:18:32Z"},{"alias_kind":"pith_short_12","alias_value":"GHNBNG4UEWLW","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"GHNBNG4UEWLWPVU2","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"GHNBNG4U","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:03ac14979a0cda6bc8e8681ab25161cb0a495932b11db8df55b8554805efdbf3","target":"graph","created_at":"2026-05-17T21:57:18Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"SANA-WM achieves visual quality comparable to large-scale industrial baselines such as LingBot-World and HY-WorldPlay, while significantly improving efficiency... On our one-minute world-model benchmark, SANA-WM demonstrates stronger action-following accuracy than prior open-source baselines and achieves comparable visual quality at 36× higher throughput for scalable world modeling."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"The robust annotation pipeline extracts accurate metric-scale 6-DoF camera poses from public videos to yield high-quality, spatiotemporally consistent action labels that enable effective training of the world model."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"SANA-WM is a 2.6B-parameter efficient world model that synthesizes minute-scale 720p videos with 6-DoF camera control, trained on 213K public clips in 15 days on 64 H100s and runnable on single GPUs at 36x higher throughput than prior open baselines."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"SANA-WM generates minute-scale 720p videos with camera control at 36 times higher throughput than prior open-source models."}],"snapshot_sha256":"55dd0e6b4c1d1b95062655e9933b87c866f4fab94d32b2de282676097a8c805f"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"We introduce SANA-WM, an efficient 2.6B-parameter open-source world model natively trained for one-minute generation, synthesizing high-fidelity, 720p, minute-scale videos with precise camera control. SANA-WM achieves visual quality comparable to large-scale industrial baselines such as LingBot-World and HY-WorldPlay, while significantly improving efficiency. Four core designs drive our architecture: (1) Hybrid Linear Attention combines frame-wise Gated DeltaNet (GDN) with softmax attention for memory-efficient long-context modeling. (2) Dual-Branch Camera Control ensures precise 6-DoF traject","authors_text":"Enze Xie, Haoyi Zhu, Haozhe Liu, Jincheng Yu, Junsong Chen, Song Han, Tian Ye, Tong He, Yuyang Zhao","cross_cats":[],"headline":"SANA-WM generates minute-scale 720p videos with camera control at 36 times higher throughput than prior open-source models.","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-14T17:58:03Z","title":"SANA-WM: Efficient Minute-Scale World Modeling with Hybrid Linear Diffusion Transformer"},"references":{"count":102,"internal_anchors":46,"resolved_work":102,"sample":[{"cited_arxiv_id":"1803.10122","doi":"","is_internal_anchor":true,"ref_index":1,"title":"World Models","work_id":"07227eee-8445-4c98-bce4-c6a6fd5ed907","year":2018},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Genie 3: A new frontier for world models","work_id":"94eb34fb-202e-47c8-bd7e-5e1c88ff88ae","year":2025},{"cited_arxiv_id":"2309.17080","doi":"","is_internal_anchor":true,"ref_index":3,"title":"GAIA-1: A Generative World Model for Autonomous Driving","work_id":"313484e6-a442-4522-8e19-d07e502844a8","year":2023},{"cited_arxiv_id":"2602.06949","doi":"","is_internal_anchor":true,"ref_index":4,"title":"DreamDojo: A Generalist Robot World Model from Large-Scale Human Videos","work_id":"95f2f415-c659-4084-a008-39303bea8638","year":2026},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Aether: Geometric-aware unified world modeling","work_id":"3aecc6ce-b828-4cf4-8553-7b25b9e2051c","year":2025}],"snapshot_sha256":"c904497025477ff38bfb7bb1e27b222f9c4bf279206dc36796e2646b766a77b4"},"source":{"id":"2605.15178","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-15T13:52:18.797742Z","id":"f6f23a26-71f1-4e50-9451-a62bfc91b10d","model_set":{"reader":"grok-4.3"},"one_line_summary":"SANA-WM is a 2.6B-parameter efficient world model that synthesizes minute-scale 720p videos with 6-DoF camera control, trained on 213K public clips in 15 days on 64 H100s and runnable on single GPUs at 36x higher throughput than prior open baselines.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"SANA-WM generates minute-scale 720p videos with camera control at 36 times higher throughput than prior open-source models.","strongest_claim":"SANA-WM achieves visual quality comparable to large-scale industrial baselines such as LingBot-World and HY-WorldPlay, while significantly improving efficiency... On our one-minute world-model benchmark, SANA-WM demonstrates stronger action-following accuracy than prior open-source baselines and achieves comparable visual quality at 36× higher throughput for scalable world modeling.","weakest_assumption":"The robust annotation pipeline extracts accurate metric-scale 6-DoF camera poses from public videos to yield high-quality, spatiotemporally consistent action labels that enable effective training of the world model."}},"verdict_id":"f6f23a26-71f1-4e50-9451-a62bfc91b10d"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:e514e0a9896a9ff55c8e06090097adbb7602f629fe3eb6a3775a18ef4297af41","target":"record","created_at":"2026-05-17T21:18:32Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"eef6e0529d268c02237d963884733f3ac70a5b621928b74e9f4c12eaa73e4cd1","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-14T17:58:03Z","title_canon_sha256":"ac7f02e2b426f77376b0a94085db9f39f21a7eac26efeb50d349b13165daa19a"},"schema_version":"1.0","source":{"id":"2605.15178","kind":"arxiv","version":1}},"canonical_sha256":"31da169b94259767d69a6c43a8bb44c6d0d47bdb22323b83a8eea4144ede5b37","receipt":{"builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"31da169b94259767d69a6c43a8bb44c6d0d47bdb22323b83a8eea4144ede5b37","first_computed_at":"2026-05-17T21:40:25.203865Z","kind":"pith_receipt","last_reissued_at":"2026-05-17T21:57:18.558439Z","receipt_version":"0.2","signature_status":"unsigned_v0"},"source_id":"2605.15178","source_kind":"arxiv","source_version":1}}},"equivocations":[{"signer_id":"pith.science","event_type":"integrity_finding","target":"integrity","event_ids":["sha256:1ad87288d5eff31cc79aebc69197b1a63cb93bff65b8a300e5203158a64a0c84","sha256:4c134ca032d7128d3403c36739c5cc65b76f94a256686506dfef48af848efea2"]}],"invalid_events":[],"applied_event_ids":["sha256:e514e0a9896a9ff55c8e06090097adbb7602f629fe3eb6a3775a18ef4297af41","sha256:03ac14979a0cda6bc8e8681ab25161cb0a495932b11db8df55b8554805efdbf3"],"state_sha256":"c078e1a27d10c777b1c9a937cd263a9b1a8c670afdde92e591181aaf76303030"}