{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2025:LWVTT7QDM7U2MZKY67CFNOK2OO","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"87a6d0d45f33b663733e1d3ccab4840f56fea1a808eabf19e20b21ee3d318aa3","cross_cats_sorted":["cs.AI","cs.CL"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2025-10-10T17:59:58Z","title_canon_sha256":"b46c56aa5c0f7276e4ddc1686d851d012fd82e8fe2abf4e0a3fb109d49914448"},"schema_version":"1.0","source":{"id":"2510.09608","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2510.09608","created_at":"2026-05-17T23:38:14Z"},{"alias_kind":"arxiv_version","alias_value":"2510.09608v1","created_at":"2026-05-17T23:38:14Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2510.09608","created_at":"2026-05-17T23:38:14Z"},{"alias_kind":"pith_short_12","alias_value":"LWVTT7QDM7U2","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"LWVTT7QDM7U2MZKY","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"LWVTT7QD","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:98a72a05711a0c1e49035d681fe426afda19675845caddf2820bd0f504d60fcd","target":"graph","created_at":"2026-05-17T23:38:14Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"On Inf-Streams-Eval, StreamingVLM achieves a 66.18% win rate against GPT-4O mini and maintains stable, real-time performance at up to 8 FPS on a single NVIDIA H100."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That supervised fine-tuning with full attention on short overlapped video chunks will produce stable coherence and performance when the same model is later run with the streaming KV cache on arbitrarily long, non-overlapped video streams."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"StreamingVLM enables stable real-time understanding of infinite video streams at up to 8 FPS using a streaming KV cache and aligned SFT on overlapped chunks, with a 66.18% win rate over GPT-4O mini on a new two-hour video benchmark."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"A vision-language model achieves stable real-time understanding of arbitrarily long video streams through a streaming attention cache aligned with training on short clips."}],"snapshot_sha256":"5648812ab5122e4765157cc167940d09c1e98fbbaa513d195acd690b4165da07"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"4fe17a9627caac04e6a62ee27aa9201af505c751465ad4d493cba9a9d27fae09"},"paper":{"abstract_excerpt":"Vision-language models (VLMs) could power real-time assistants and autonomous agents, but they face a critical challenge: understanding near-infinite video streams without escalating latency and memory usage. Processing entire videos with full attention leads to quadratic computational costs and poor performance on long videos. Meanwhile, simple sliding window methods are also flawed, as they either break coherence or suffer from high latency due to redundant recomputation. In this paper, we introduce StreamingVLM, a model designed for real-time, stable understanding of infinite visual input. ","authors_text":"Guangxuan Xiao, Kelly Peng, Liuning He, Ruyi Xu, Song Han, Yao Lu, Yukang Chen","cross_cats":["cs.AI","cs.CL"],"headline":"A vision-language model achieves stable real-time understanding of arbitrarily long video streams through a streaming attention cache aligned with training on short clips.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2025-10-10T17:59:58Z","title":"StreamingVLM: Real-Time Understanding for Infinite Video Streams"},"references":{"count":12,"internal_anchors":8,"resolved_work":12,"sample":[{"cited_arxiv_id":"2502.13923","doi":"","is_internal_anchor":true,"ref_index":1,"title":"Qwen2.5-VL Technical Report","work_id":"69dffacb-bfe8-442d-be86-48624c60426f","year":null},{"cited_arxiv_id":"2406.07476","doi":"","is_internal_anchor":true,"ref_index":2,"title":"VideoLLaMA 2: Advancing Spatial-Temporal Modeling and Audio Understanding in Video-LLMs","work_id":"ccfc3f89-c510-45f1-8a35-ed1a56c0ae5c","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"arXiv preprint arXiv:2503.00540 , year=","work_id":"f9b28e0b-f48b-484b-a271-22ecec990b86","year":null},{"cited_arxiv_id":"2402.13753","doi":"","is_internal_anchor":true,"ref_index":4,"title":"LongRoPE: Extending LLM Context Window Beyond 2 Million Tokens","work_id":"41fabff1-11da-43da-8efd-2eb55186b9f2","year":null},{"cited_arxiv_id":"2405.21075","doi":"","is_internal_anchor":true,"ref_index":5,"title":"Video-MME: The First-Ever Comprehensive Evaluation Benchmark of Multi-modal LLMs in Video Analysis","work_id":"77fd5ac9-ae98-4846-9d83-e9c73c8f2a52","year":null}],"snapshot_sha256":"511e5fcf9d8e0de4ded0821cc6f59a4d9b37d2f04771beddcde774589ad51a78"},"source":{"id":"2510.09608","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-17T11:47:01.498125Z","id":"544ab5f3-bc89-4a9c-af69-1225c41a838d","model_set":{"reader":"grok-4.3"},"one_line_summary":"StreamingVLM enables stable real-time understanding of infinite video streams at up to 8 FPS using a streaming KV cache and aligned SFT on overlapped chunks, with a 66.18% win rate over GPT-4O mini on a new two-hour video benchmark.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"A vision-language model achieves stable real-time understanding of arbitrarily long video streams through a streaming attention cache aligned with training on short clips.","strongest_claim":"On Inf-Streams-Eval, StreamingVLM achieves a 66.18% win rate against GPT-4O mini and maintains stable, real-time performance at up to 8 FPS on a single NVIDIA H100.","weakest_assumption":"That supervised fine-tuning with full attention on short overlapped video chunks will produce stable coherence and performance when the same model is later run with the streaming KV cache on arbitrarily long, non-overlapped video streams."}},"verdict_id":"544ab5f3-bc89-4a9c-af69-1225c41a838d"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:8e9ea4cdc4184aa8db108c9e8591ebdece51b6e730dd0bd6c073cb9090ba3bcf","target":"record","created_at":"2026-05-17T23:38:14Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"87a6d0d45f33b663733e1d3ccab4840f56fea1a808eabf19e20b21ee3d318aa3","cross_cats_sorted":["cs.AI","cs.CL"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2025-10-10T17:59:58Z","title_canon_sha256":"b46c56aa5c0f7276e4ddc1686d851d012fd82e8fe2abf4e0a3fb109d49914448"},"schema_version":"1.0","source":{"id":"2510.09608","kind":"arxiv","version":1}},"canonical_sha256":"5dab39fe0367e9a66558f7c456b95a738cc15ff70419b293c2e5ec8f7245c54c","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"5dab39fe0367e9a66558f7c456b95a738cc15ff70419b293c2e5ec8f7245c54c","first_computed_at":"2026-05-17T23:38:14.195787Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:14.195787Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"AwbssBV8OuJBsIJkz6hCpv+24Gh+ODYaN3jn9bFI2uf+mI5ogG4Fq3FeFIVTFLQpIWsjreGcepfsQ9Q5TZ0kDw==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:14.196276Z","signed_message":"canonical_sha256_bytes"},"source_id":"2510.09608","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:8e9ea4cdc4184aa8db108c9e8591ebdece51b6e730dd0bd6c073cb9090ba3bcf","sha256:98a72a05711a0c1e49035d681fe426afda19675845caddf2820bd0f504d60fcd"],"state_sha256":"b0e54bf0daf44a7e00b3e1cf652fa1d33029c7c08a7c4d5610d0772a638d1cba"}