{"paper":{"title":"Streaming 4D Visual Geometry Transformer","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"A causal streaming transformer reconstructs 3D geometry from video online by caching historical frame information.","cross_cats":["cs.AI","cs.LG"],"primary_cat":"cs.CV","authors_text":"Dong Zhuo, Jiahe Guo, Jie Zhou, Jiwen Lu, Wenzhao Zheng, Yuqi Wu","submitted_at":"2025-07-15T17:59:57Z","abstract_excerpt":"Perceiving and reconstructing 3D geometry from videos is a fundamental yet challenging computer vision task. To facilitate interactive and low-latency applications, we propose a streaming visual geometry transformer that shares a similar philosophy with autoregressive large language models. We explore a simple and efficient design and employ a causal transformer architecture to process the input sequence in an online manner. We use temporal causal attention and cache the historical keys and values as implicit memory to enable efficient streaming long-term 3D reconstruction. This design can han"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Our model enhances inference speed in online scenarios while maintaining competitive performance, thereby facilitating scalable and interactive 3D vision systems.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That knowledge distilled from the dense bidirectional VGGT model transfers effectively to the causal streaming architecture without losing critical spatial consistency over long sequences.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"A causal transformer with key-value caching and distillation from a bidirectional VGGT model enables efficient online 4D geometry reconstruction from videos.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"A causal streaming transformer reconstructs 3D geometry from video online by caching historical frame information.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"42b5a3149488b3596ede80968ebda37e1804704b69b77b24234b04904b39ceeb"},"source":{"id":"2507.11539","kind":"arxiv","version":2},"verdict":{"id":"55fdb9db-55bb-4b51-be2b-cb1e642af680","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T22:54:47.463876Z","strongest_claim":"Our model enhances inference speed in online scenarios while maintaining competitive performance, thereby facilitating scalable and interactive 3D vision systems.","one_line_summary":"A causal transformer with key-value caching and distillation from a bidirectional VGGT model enables efficient online 4D geometry reconstruction from videos.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That knowledge distilled from the dense bidirectional VGGT model transfers effectively to the causal streaming architecture without losing critical spatial consistency over long sequences.","pith_extraction_headline":"A causal streaming transformer reconstructs 3D geometry from video online by caching historical frame information."},"references":{"count":16,"sample":[{"doi":"","year":null,"title":"ARKitScenes: A Diverse Real-World Dataset For 3D Indoor Scene Understanding Using Mobile RGB-D Data","work_id":"0ce910be-ca1c-44c7-b7b1-c5353759d85e","ref_index":1,"cited_arxiv_id":"2111.08897","is_internal_anchor":true},{"doi":"","year":2001,"title":"Virtual KITTI 2","work_id":"c0d9c030-aa25-44e7-9cc4-72d7403f1447","ref_index":2,"cited_arxiv_id":"2001.10773","is_internal_anchor":true},{"doi":"","year":null,"title":"FlashAttention-2: Faster Attention with Better Parallelism and Work Partitioning","work_id":"fff3953b-5efb-4753-bee4-002f59995810","ref_index":3,"cited_arxiv_id":"2307.08691","is_internal_anchor":true},{"doi":"","year":null,"title":"arXiv preprint arXiv:2412.06777 (2024)","work_id":"470cab2f-587e-4d05-a154-38f4daf22689","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Stream3r: Scalable sequential 3d reconstruction with causal transformer","work_id":"d910192e-ef5a-4704-ba04-be65066b9a28","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":16,"snapshot_sha256":"a156214b3a29f69a19f30999a9d65093f789d6f466a828b94e5ff44c54af21d4","internal_anchors":8},"formal_canon":{"evidence_count":2,"snapshot_sha256":"f34c89f63a2cacdfc65db821e9cbd70f43996c54e4b7ee42616012c03e7eab72"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"}