{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2025:OVTZMQ4L5I6FT2X6KD52IQZJDF","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"7f67076de87788c69b47d5551c71b2d7952f4d9b071ccc3f97727c58fedf0259","cross_cats_sorted":["cs.CV"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2025-02-10T18:44:25Z","title_canon_sha256":"cd40cad7c6e5ff3cfb9fe443a7080dd443898d14479092a35d08ed647021426f"},"schema_version":"1.0","source":{"id":"2502.06764","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2502.06764","created_at":"2026-05-17T23:38:47Z"},{"alias_kind":"arxiv_version","alias_value":"2502.06764v2","created_at":"2026-05-17T23:38:47Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2502.06764","created_at":"2026-05-17T23:38:47Z"},{"alias_kind":"pith_short_12","alias_value":"OVTZMQ4L5I6F","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"OVTZMQ4L5I6FT2X6","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"OVTZMQ4L","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:979414eae13c4cb2e29951a67a8cb62379dee5edfe9e6dc68c92bfbefcc16504","target":"graph","created_at":"2026-05-17T23:38:47Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"We propose the Diffusion Forcing Transformer (DFoT), a video diffusion architecture and theoretically grounded training objective that jointly enable conditioning on a flexible number of history frames. We then introduce History Guidance, a family of guidance methods uniquely enabled by DFoT."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the DFoT training objective and architecture truly support arbitrary-length history without hidden performance costs or instability, and that the proposed history guidance methods generalize beyond the tested datasets and lengths."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"DFoT enables flexible history conditioning in video diffusion, with history guidance methods that boost temporal consistency and support long rollouts."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Diffusion Forcing Transformer lets video models condition on any number of past frames."}],"snapshot_sha256":"ab9d39b15b60defad8a26c7b4abf729c9f014c06e4cf49b993c1309483d4729b"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"Classifier-free guidance (CFG) is a key technique for improving conditional generation in diffusion models, enabling more accurate control while enhancing sample quality. It is natural to extend this technique to video diffusion, which generates video conditioned on a variable number of context frames, collectively referred to as history. However, we find two key challenges to guiding with variable-length history: architectures that only support fixed-size conditioning, and the empirical observation that CFG-style history dropout performs poorly. To address this, we propose the Diffusion Forci","authors_text":"Boyuan Chen, Kiwhan Song, Max Simchowitz, Russ Tedrake, Vincent Sitzmann, Yilun Du","cross_cats":["cs.CV"],"headline":"Diffusion Forcing Transformer lets video models condition on any number of past frames.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2025-02-10T18:44:25Z","title":"History-Guided Video Diffusion"},"references":{"count":70,"internal_anchors":20,"resolved_work":70,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"All are worth words: A vit backbone for diffusion models","work_id":"4b93ec35-06cf-40f1-8cbe-c6b896c36f19","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Bellec, P. C. Optimal exponential bounds for aggregation of density estimators. Bernoulli, 23 0 (1): 0 219--248, 2017","work_id":"f969585b-da22-44b3-b85a-b85fccaa9ca8","year":2017},{"cited_arxiv_id":"2311.15127","doi":"","is_internal_anchor":true,"ref_index":3,"title":"Stable Video Diffusion: Scaling Latent Video Diffusion Models to Large Datasets","work_id":"4f68eada-27e3-437a-a2fe-6e4ca524d0d3","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"W., Fidler, S., and Kreis, K","work_id":"957f9e1a-a2b2-431e-bff6-aecffd524f53","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Video generation models as world simulators","work_id":"e020e1af-8964-4aaa-a232-2d7f0d16f6a4","year":2024}],"snapshot_sha256":"2043db7526c58d7266dfc948a0c5b0411fc12f24e5d1e59aeea3e77a61684eb4"},"source":{"id":"2502.06764","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-16T11:56:30.341937Z","id":"9276c95b-0f63-4762-9abf-327be7b61973","model_set":{"reader":"grok-4.3"},"one_line_summary":"DFoT enables flexible history conditioning in video diffusion, with history guidance methods that boost temporal consistency and support long rollouts.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Diffusion Forcing Transformer lets video models condition on any number of past frames.","strongest_claim":"We propose the Diffusion Forcing Transformer (DFoT), a video diffusion architecture and theoretically grounded training objective that jointly enable conditioning on a flexible number of history frames. We then introduce History Guidance, a family of guidance methods uniquely enabled by DFoT.","weakest_assumption":"That the DFoT training objective and architecture truly support arbitrary-length history without hidden performance costs or instability, and that the proposed history guidance methods generalize beyond the tested datasets and lengths."}},"verdict_id":"9276c95b-0f63-4762-9abf-327be7b61973"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:215aa975c7ca244e2d83e55197584c63cd7b4b84adb3bf41e5d4226757b45d30","target":"record","created_at":"2026-05-17T23:38:47Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"7f67076de87788c69b47d5551c71b2d7952f4d9b071ccc3f97727c58fedf0259","cross_cats_sorted":["cs.CV"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2025-02-10T18:44:25Z","title_canon_sha256":"cd40cad7c6e5ff3cfb9fe443a7080dd443898d14479092a35d08ed647021426f"},"schema_version":"1.0","source":{"id":"2502.06764","kind":"arxiv","version":2}},"canonical_sha256":"756796438bea3c59eafe50fba44329197d7363b68df6f6c2c614f33ca7b2c00e","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"756796438bea3c59eafe50fba44329197d7363b68df6f6c2c614f33ca7b2c00e","first_computed_at":"2026-05-17T23:38:47.953184Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:47.953184Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"L0DGg4b4PtvgZW+Zrwc0z6vjlIzQJQwjBCsIudkQza9fA3SfqzcDG+3n0QWTdmo9bCzeqOAv1kMC05OfEd9yDQ==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:47.953804Z","signed_message":"canonical_sha256_bytes"},"source_id":"2502.06764","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:215aa975c7ca244e2d83e55197584c63cd7b4b84adb3bf41e5d4226757b45d30","sha256:979414eae13c4cb2e29951a67a8cb62379dee5edfe9e6dc68c92bfbefcc16504"],"state_sha256":"de16f35d0af632895ba84e7835214ee3fdb44a043f5b735529bef69372c07426"}