{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:YM42HKXD6S3LED2CNKOXBWOBDP","short_pith_number":"pith:YM42HKXD","schema_version":"1.0","canonical_sha256":"c339a3aae3f4b6b20f426a9d70d9c11be38c5f54bd00b4850515e5cdcbb6c6b7","source":{"kind":"arxiv","id":"2507.11539","version":2},"attestation_state":"computed","paper":{"title":"Streaming 4D Visual Geometry Transformer","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"A causal streaming transformer reconstructs 3D geometry from video online by caching historical frame information.","cross_cats":["cs.AI","cs.LG"],"primary_cat":"cs.CV","authors_text":"Dong Zhuo, Jiahe Guo, Jie Zhou, Jiwen Lu, Wenzhao Zheng, Yuqi Wu","submitted_at":"2025-07-15T17:59:57Z","abstract_excerpt":"Perceiving and reconstructing 3D geometry from videos is a fundamental yet challenging computer vision task. To facilitate interactive and low-latency applications, we propose a streaming visual geometry transformer that shares a similar philosophy with autoregressive large language models. We explore a simple and efficient design and employ a causal transformer architecture to process the input sequence in an online manner. We use temporal causal attention and cache the historical keys and values as implicit memory to enable efficient streaming long-term 3D reconstruction. This design can han"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":true,"formal_links_present":true},"canonical_record":{"source":{"id":"2507.11539","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2025-07-15T17:59:57Z","cross_cats_sorted":["cs.AI","cs.LG"],"title_canon_sha256":"a86349b28d8291141219b29bb327fe73eee0eb6d2c60c7b5fc5c6e1ca945a1a7","abstract_canon_sha256":"4b1a119db23ba2a0d8f6e189c0b73963557d87527dc3dd2967be309373f91287"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:49.862866Z","signature_b64":"yXXdbqLd/K3N67mn+mWlaBgfzM7j5U9N5FTu+t+A3EWucSj+lAFvaBb//5szMLI3TxODoUv+qw6xAZItzfphCA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"c339a3aae3f4b6b20f426a9d70d9c11be38c5f54bd00b4850515e5cdcbb6c6b7","last_reissued_at":"2026-05-17T23:38:49.862257Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:49.862257Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Streaming 4D Visual Geometry Transformer","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"A causal streaming transformer reconstructs 3D geometry from video online by caching historical frame information.","cross_cats":["cs.AI","cs.LG"],"primary_cat":"cs.CV","authors_text":"Dong Zhuo, Jiahe Guo, Jie Zhou, Jiwen Lu, Wenzhao Zheng, Yuqi Wu","submitted_at":"2025-07-15T17:59:57Z","abstract_excerpt":"Perceiving and reconstructing 3D geometry from videos is a fundamental yet challenging computer vision task. To facilitate interactive and low-latency applications, we propose a streaming visual geometry transformer that shares a similar philosophy with autoregressive large language models. We explore a simple and efficient design and employ a causal transformer architecture to process the input sequence in an online manner. We use temporal causal attention and cache the historical keys and values as implicit memory to enable efficient streaming long-term 3D reconstruction. This design can han"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Our model enhances inference speed in online scenarios while maintaining competitive performance, thereby facilitating scalable and interactive 3D vision systems.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That knowledge distilled from the dense bidirectional VGGT model transfers effectively to the causal streaming architecture without losing critical spatial consistency over long sequences.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"A causal transformer with key-value caching and distillation from a bidirectional VGGT model enables efficient online 4D geometry reconstruction from videos.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"A causal streaming transformer reconstructs 3D geometry from video online by caching historical frame information.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"42b5a3149488b3596ede80968ebda37e1804704b69b77b24234b04904b39ceeb"},"source":{"id":"2507.11539","kind":"arxiv","version":2},"verdict":{"id":"55fdb9db-55bb-4b51-be2b-cb1e642af680","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T22:54:47.463876Z","strongest_claim":"Our model enhances inference speed in online scenarios while maintaining competitive performance, thereby facilitating scalable and interactive 3D vision systems.","one_line_summary":"A causal transformer with key-value caching and distillation from a bidirectional VGGT model enables efficient online 4D geometry reconstruction from videos.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That knowledge distilled from the dense bidirectional VGGT model transfers effectively to the causal streaming architecture without losing critical spatial consistency over long sequences.","pith_extraction_headline":"A causal streaming transformer reconstructs 3D geometry from video online by caching historical frame information."},"references":{"count":16,"sample":[{"doi":"","year":null,"title":"ARKitScenes: A Diverse Real-World Dataset For 3D Indoor Scene Understanding Using Mobile RGB-D Data","work_id":"0ce910be-ca1c-44c7-b7b1-c5353759d85e","ref_index":1,"cited_arxiv_id":"2111.08897","is_internal_anchor":true},{"doi":"","year":2001,"title":"Virtual KITTI 2","work_id":"c0d9c030-aa25-44e7-9cc4-72d7403f1447","ref_index":2,"cited_arxiv_id":"2001.10773","is_internal_anchor":true},{"doi":"","year":null,"title":"FlashAttention-2: Faster Attention with Better Parallelism and Work Partitioning","work_id":"fff3953b-5efb-4753-bee4-002f59995810","ref_index":3,"cited_arxiv_id":"2307.08691","is_internal_anchor":true},{"doi":"","year":null,"title":"arXiv preprint arXiv:2412.06777 (2024)","work_id":"470cab2f-587e-4d05-a154-38f4daf22689","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Stream3r: Scalable sequential 3d reconstruction with causal transformer","work_id":"d910192e-ef5a-4704-ba04-be65066b9a28","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":16,"snapshot_sha256":"a156214b3a29f69a19f30999a9d65093f789d6f466a828b94e5ff44c54af21d4","internal_anchors":8},"formal_canon":{"evidence_count":2,"snapshot_sha256":"f34c89f63a2cacdfc65db821e9cbd70f43996c54e4b7ee42616012c03e7eab72"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2507.11539","created_at":"2026-05-17T23:38:49.862357+00:00"},{"alias_kind":"arxiv_version","alias_value":"2507.11539v2","created_at":"2026-05-17T23:38:49.862357+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2507.11539","created_at":"2026-05-17T23:38:49.862357+00:00"},{"alias_kind":"pith_short_12","alias_value":"YM42HKXD6S3L","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_16","alias_value":"YM42HKXD6S3LED2C","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_8","alias_value":"YM42HKXD","created_at":"2026-05-18T12:33:37.589309+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":22,"internal_anchor_count":22,"sample":[{"citing_arxiv_id":"2510.17568","citing_title":"PAGE-4D: VGGT-4D Perception via Disentangled Pose and Geometry Estimation","ref_index":20,"is_internal_anchor":true},{"citing_arxiv_id":"2511.14751","citing_title":"Co-Me: Confidence-Guided Token Merging for Visual Geometric Transformers","ref_index":44,"is_internal_anchor":true},{"citing_arxiv_id":"2509.26645","citing_title":"TTT3R: 3D Reconstruction as Test-Time Training","ref_index":106,"is_internal_anchor":true},{"citing_arxiv_id":"2509.02560","citing_title":"FastVGGT: Training-Free Acceleration of Visual Geometry Transformer","ref_index":33,"is_internal_anchor":true},{"citing_arxiv_id":"2603.07690","citing_title":"FrameVGGT: Geometry-Aligned Frame-Level Memory for Bounded Streaming VGGT","ref_index":14,"is_internal_anchor":true},{"citing_arxiv_id":"2603.20284","citing_title":"STAC: Plug-and-Play Spatio-Temporal Aware Cache Compression for Streaming 3D Reconstruction","ref_index":52,"is_internal_anchor":true},{"citing_arxiv_id":"2604.00813","citing_title":"DVGT-2: Vision-Geometry-Action Model for Autonomous Driving at Scale","ref_index":92,"is_internal_anchor":true},{"citing_arxiv_id":"2605.11367","citing_title":"3D-Belief: Embodied Belief Inference via Generative 3D World Modeling","ref_index":27,"is_internal_anchor":true},{"citing_arxiv_id":"2605.08371","citing_title":"PaceVGGT: Pre-Alternating-Attention Token Pruning for Visual Geometry Transformers","ref_index":19,"is_internal_anchor":true},{"citing_arxiv_id":"2605.09644","citing_title":"Attention Itself Could Retrieve.RetrieveVGGT: Training-Free Long Context Streaming 3D Reconstruction via Query-Key Similarity Retrieval","ref_index":27,"is_internal_anchor":true},{"citing_arxiv_id":"2605.06270","citing_title":"Spark3R: Asymmetric Token Reduction Makes Fast Feed-Forward 3D Reconstruction","ref_index":27,"is_internal_anchor":true},{"citing_arxiv_id":"2604.21915","citing_title":"Vista4D: Video Reshooting with 4D Point Clouds","ref_index":46,"is_internal_anchor":true},{"citing_arxiv_id":"2604.10098","citing_title":"Attention Sink in Transformers: A Survey on Utilization, Interpretation, and Mitigation","ref_index":195,"is_internal_anchor":true},{"citing_arxiv_id":"2604.08542","citing_title":"Scal3R: Scalable Test-Time Training for Large-Scale 3D Reconstruction","ref_index":101,"is_internal_anchor":true},{"citing_arxiv_id":"2511.10647","citing_title":"Depth Anything 3: Recovering the Visual Space from Any Views","ref_index":19,"is_internal_anchor":true},{"citing_arxiv_id":"2604.07350","citing_title":"Fast Spatial Memory with Elastic Test-Time Training","ref_index":78,"is_internal_anchor":true},{"citing_arxiv_id":"2604.07279","citing_title":"Mem3R: Streaming 3D Reconstruction with Hybrid Memory via Test-Time Training","ref_index":80,"is_internal_anchor":true},{"citing_arxiv_id":"2604.05351","citing_title":"AnyImageNav: Any-View Geometry for Precise Last-Meter Image-Goal Navigation","ref_index":44,"is_internal_anchor":true},{"citing_arxiv_id":"2604.14025","citing_title":"Feed-Forward 3D Scene Modeling: A Problem-Driven Perspective","ref_index":103,"is_internal_anchor":true},{"citing_arxiv_id":"2604.15237","citing_title":"StreamCacheVGGT: Streaming Visual Geometry Transformers with Robust Scoring and Hybrid Cache Compression","ref_index":6,"is_internal_anchor":true},{"citing_arxiv_id":"2604.15284","citing_title":"GlobalSplat: Efficient Feed-Forward 3D Gaussian Splatting via Global Scene Tokens","ref_index":44,"is_internal_anchor":true},{"citing_arxiv_id":"2604.18260","citing_title":"Geometry-Guided 3D Visual Token Pruning for Video-Language Models","ref_index":53,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":2,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/YM42HKXD6S3LED2CNKOXBWOBDP","json":"https://pith.science/pith/YM42HKXD6S3LED2CNKOXBWOBDP.json","graph_json":"https://pith.science/api/pith-number/YM42HKXD6S3LED2CNKOXBWOBDP/graph.json","events_json":"https://pith.science/api/pith-number/YM42HKXD6S3LED2CNKOXBWOBDP/events.json","paper":"https://pith.science/paper/YM42HKXD"},"agent_actions":{"view_html":"https://pith.science/pith/YM42HKXD6S3LED2CNKOXBWOBDP","download_json":"https://pith.science/pith/YM42HKXD6S3LED2CNKOXBWOBDP.json","view_paper":"https://pith.science/paper/YM42HKXD","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2507.11539&json=true","fetch_graph":"https://pith.science/api/pith-number/YM42HKXD6S3LED2CNKOXBWOBDP/graph.json","fetch_events":"https://pith.science/api/pith-number/YM42HKXD6S3LED2CNKOXBWOBDP/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/YM42HKXD6S3LED2CNKOXBWOBDP/action/timestamp_anchor","attest_storage":"https://pith.science/pith/YM42HKXD6S3LED2CNKOXBWOBDP/action/storage_attestation","attest_author":"https://pith.science/pith/YM42HKXD6S3LED2CNKOXBWOBDP/action/author_attestation","sign_citation":"https://pith.science/pith/YM42HKXD6S3LED2CNKOXBWOBDP/action/citation_signature","submit_replication":"https://pith.science/pith/YM42HKXD6S3LED2CNKOXBWOBDP/action/replication_record"}},"created_at":"2026-05-17T23:38:49.862357+00:00","updated_at":"2026-05-17T23:38:49.862357+00:00"}