{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:L3NA6IVRNT557KRRZYRICUT76V","short_pith_number":"pith:L3NA6IVR","schema_version":"1.0","canonical_sha256":"5eda0f22b16cfbdfaa31ce2281527ff57f9b9bd32944180336922f00e088c04d","source":{"kind":"arxiv","id":"2605.14487","version":1},"attestation_state":"computed","paper":{"title":"Head Forcing: Long Autoregressive Video Generation via Head Heterogeneity","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Attention heads in autoregressive video diffusion transformers naturally divide into local, anchor, and memory roles, enabling a training-free Head Forcing method to generate minute-long videos by assigning each type specialized KV cache策略.","cross_cats":["cs.AI"],"primary_cat":"cs.CV","authors_text":"Chi Zhang, Gang Yu, Jiahao Tian, Yiwei Wang","submitted_at":"2026-05-14T07:27:39Z","abstract_excerpt":"Autoregressive video diffusion models support real-time synthesis but suffer from error accumulation and context loss over long horizons. We discover that attention heads in AR video diffusion transformers serve functionally distinct roles as local heads for detail refinement, anchor heads for structural stabilization, and memory heads for long-range context aggregation, yet existing methods treat them uniformly, leading to suboptimal KV cache allocation. We propose Head Forcing, a training-free framework that assigns each head type a tailored KV cache strategy: local and anchor heads retain o"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":true,"formal_links_present":false},"canonical_record":{"source":{"id":"2605.14487","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-14T07:27:39Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"37768ef9866f318c71d0fcbfc4e35c6f2acaca235c13e38eeda1ee854f4d176b","abstract_canon_sha256":"1d4ec4f37b71cade6bb4662ffbf739d1161d570f08b28cb2cb42a080a6ea2a43"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:39:06.478722Z","signature_b64":"w8w7OY44SPPYcBHUd8mqmJ7x9HpOPXc2OOv4AeQ4mO5Xfc3A+cVFKggPY+lWdaV0Gc0Oqopqb8saZIZISQ2DDg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"5eda0f22b16cfbdfaa31ce2281527ff57f9b9bd32944180336922f00e088c04d","last_reissued_at":"2026-05-17T23:39:06.477970Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:39:06.477970Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Head Forcing: Long Autoregressive Video Generation via Head Heterogeneity","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Attention heads in autoregressive video diffusion transformers naturally divide into local, anchor, and memory roles, enabling a training-free Head Forcing method to generate minute-long videos by assigning each type specialized KV cache策略.","cross_cats":["cs.AI"],"primary_cat":"cs.CV","authors_text":"Chi Zhang, Gang Yu, Jiahao Tian, Yiwei Wang","submitted_at":"2026-05-14T07:27:39Z","abstract_excerpt":"Autoregressive video diffusion models support real-time synthesis but suffer from error accumulation and context loss over long horizons. We discover that attention heads in AR video diffusion transformers serve functionally distinct roles as local heads for detail refinement, anchor heads for structural stabilization, and memory heads for long-range context aggregation, yet existing methods treat them uniformly, leading to suboptimal KV cache allocation. We propose Head Forcing, a training-free framework that assigns each head type a tailored KV cache strategy: local and anchor heads retain o"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Without additional training, Head Forcing extends generation from 5 seconds to minute-level duration, supports multi-prompt interactive synthesis, and consistently outperforms existing baselines.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That attention heads in AR video diffusion transformers naturally and reliably fall into distinct functional categories (local for detail refinement, anchor for structural stabilization, memory for long-range context) that can be identified and assigned effective tailored KV cache strategies without any model-specific training or validation.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Head Forcing assigns tailored KV cache strategies to local, anchor, and memory attention heads plus head-wise RoPE re-encoding to extend autoregressive video generation from seconds to minutes without training.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Attention heads in autoregressive video diffusion transformers naturally divide into local, anchor, and memory roles, enabling a training-free Head Forcing method to generate minute-long videos by assigning each type specialized KV cache策略.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"0ad5ce4c9a58b99e03bb7899b410c75576f152ba1a867edc20dfa7900e4ae18b"},"source":{"id":"2605.14487","kind":"arxiv","version":1},"verdict":{"id":"b75fc88b-d844-4812-a5a2-3c9b79dc1d69","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T02:25:13.397502Z","strongest_claim":"Without additional training, Head Forcing extends generation from 5 seconds to minute-level duration, supports multi-prompt interactive synthesis, and consistently outperforms existing baselines.","one_line_summary":"Head Forcing assigns tailored KV cache strategies to local, anchor, and memory attention heads plus head-wise RoPE re-encoding to extend autoregressive video generation from seconds to minutes without training.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That attention heads in AR video diffusion transformers naturally and reliably fall into distinct functional categories (local for detail refinement, anchor for structural stabilization, memory for long-range context) that can be identified and assigned effective tailored KV cache strategies without any model-specific training or validation.","pith_extraction_headline":"Attention heads in autoregressive video diffusion transformers naturally divide into local, anchor, and memory roles, enabling a training-free Head Forcing method to generate minute-long videos by assigning each type specialized KV cache策略."},"references":{"count":73,"sample":[{"doi":"","year":2023,"title":"Stable Video Diffusion: Scaling Latent Video Diffusion Models to Large Datasets","work_id":"4f68eada-27e3-437a-a2fe-6e4ca524d0d3","ref_index":1,"cited_arxiv_id":"2311.15127","is_internal_anchor":true},{"doi":"","year":2023,"title":"In: Proceedings of the IEEE/CVF conference on computer vision and pattern recognition","work_id":"fe5ade26-6dc9-45fd-955e-81ae38b92d13","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"In: Forty-first International Conference on Machine Learning (2024)","work_id":"15150cbf-5929-41c1-a8fc-bec88e50d702","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"10.48550/arxiv.2508","year":2025,"title":"arXiv:2508.03841 (2025).https://doi.org/10.48550/arXiv.2508","work_id":"526cfa63-f619-4756-bc55-b880c6e77afc","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"Advances in Neural Information Processing Systems37, 24081–24125 (2024)","work_id":"3983bafd-9763-4b5b-a3a3-0a48efd94967","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":73,"snapshot_sha256":"60fab96f5edbbaf217a63f47777bad313df042be96f91469d978521f645bed91","internal_anchors":26},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2605.14487","created_at":"2026-05-17T23:39:06.478098+00:00"},{"alias_kind":"arxiv_version","alias_value":"2605.14487v1","created_at":"2026-05-17T23:39:06.478098+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.14487","created_at":"2026-05-17T23:39:06.478098+00:00"},{"alias_kind":"pith_short_12","alias_value":"L3NA6IVRNT55","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_16","alias_value":"L3NA6IVRNT557KRR","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_8","alias_value":"L3NA6IVR","created_at":"2026-05-18T12:33:37.589309+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/L3NA6IVRNT557KRRZYRICUT76V","json":"https://pith.science/pith/L3NA6IVRNT557KRRZYRICUT76V.json","graph_json":"https://pith.science/api/pith-number/L3NA6IVRNT557KRRZYRICUT76V/graph.json","events_json":"https://pith.science/api/pith-number/L3NA6IVRNT557KRRZYRICUT76V/events.json","paper":"https://pith.science/paper/L3NA6IVR"},"agent_actions":{"view_html":"https://pith.science/pith/L3NA6IVRNT557KRRZYRICUT76V","download_json":"https://pith.science/pith/L3NA6IVRNT557KRRZYRICUT76V.json","view_paper":"https://pith.science/paper/L3NA6IVR","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2605.14487&json=true","fetch_graph":"https://pith.science/api/pith-number/L3NA6IVRNT557KRRZYRICUT76V/graph.json","fetch_events":"https://pith.science/api/pith-number/L3NA6IVRNT557KRRZYRICUT76V/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/L3NA6IVRNT557KRRZYRICUT76V/action/timestamp_anchor","attest_storage":"https://pith.science/pith/L3NA6IVRNT557KRRZYRICUT76V/action/storage_attestation","attest_author":"https://pith.science/pith/L3NA6IVRNT557KRRZYRICUT76V/action/author_attestation","sign_citation":"https://pith.science/pith/L3NA6IVRNT557KRRZYRICUT76V/action/citation_signature","submit_replication":"https://pith.science/pith/L3NA6IVRNT557KRRZYRICUT76V/action/replication_record"}},"created_at":"2026-05-17T23:39:06.478098+00:00","updated_at":"2026-05-17T23:39:06.478098+00:00"}