{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2025:SCF7EQVEK6FTZY5KSXS4Z5R2SK","short_pith_number":"pith:SCF7EQVE","canonical_record":{"source":{"id":"2511.14751","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2025-11-18T18:52:22Z","cross_cats_sorted":["cs.RO"],"title_canon_sha256":"40221f17d47a6be85094c05fdf06a5182eca98f2189ae5158aac26e397e62cba","abstract_canon_sha256":"2aace5a36849de6fde91afc753105b5554fc5a4e04de5e821ea7947722357a4e"},"schema_version":"1.0"},"canonical_sha256":"908bf242a4578b3ce3aa95e5ccf63a92a0679976428f6443052c6f4c1669e621","source":{"kind":"arxiv","id":"2511.14751","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2511.14751","created_at":"2026-05-17T23:39:00Z"},{"alias_kind":"arxiv_version","alias_value":"2511.14751v2","created_at":"2026-05-17T23:39:00Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2511.14751","created_at":"2026-05-17T23:39:00Z"},{"alias_kind":"pith_short_12","alias_value":"SCF7EQVEK6FT","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"SCF7EQVEK6FTZY5K","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"SCF7EQVE","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2025:SCF7EQVEK6FTZY5KSXS4Z5R2SK","target":"record","payload":{"canonical_record":{"source":{"id":"2511.14751","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2025-11-18T18:52:22Z","cross_cats_sorted":["cs.RO"],"title_canon_sha256":"40221f17d47a6be85094c05fdf06a5182eca98f2189ae5158aac26e397e62cba","abstract_canon_sha256":"2aace5a36849de6fde91afc753105b5554fc5a4e04de5e821ea7947722357a4e"},"schema_version":"1.0"},"canonical_sha256":"908bf242a4578b3ce3aa95e5ccf63a92a0679976428f6443052c6f4c1669e621","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:39:00.734872Z","signature_b64":"uYbZfS30DP5oPCvjLR7r54tOYbiX5IJagJmCBozEP4xFx7hIm/WZ0IAk50CFKBf7SiB9G+NstTPnmdHvQlXNDQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"908bf242a4578b3ce3aa95e5ccf63a92a0679976428f6443052c6f4c1669e621","last_reissued_at":"2026-05-17T23:39:00.734159Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:39:00.734159Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2511.14751","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:39:00Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"oNAmk8N72xEEP4OYvm+LngGyJ4uPjUMn7f59CMlzd+9pOZiB3LB4pp6q/JnV29GFUnZ9I151S9jaoJwU/i8ZCQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-28T07:36:22.564709Z"},"content_sha256":"ea5ffc72f09e3d635546402eb451b5585b43fd597cc6a19de9389875e972c9f4","schema_version":"1.0","event_id":"sha256:ea5ffc72f09e3d635546402eb451b5585b43fd597cc6a19de9389875e972c9f4"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2025:SCF7EQVEK6FTZY5KSXS4Z5R2SK","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Co-Me: Confidence-Guided Token Merging for Visual Geometric Transformers","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","headline":"A distilled confidence predictor ranks and merges low-uncertainty tokens to accelerate visual geometric transformers up to 21 times without retraining.","cross_cats":["cs.RO"],"primary_cat":"cs.CV","authors_text":"Ali Agha, Jay Patrikar, Ruogu Li, Sebastian Scherer, Shayegan Omidshafiei, Yuheng Qiu, Yutian Chen","submitted_at":"2025-11-18T18:52:22Z","abstract_excerpt":"We propose Confidence-Guided Token Merging (Co-Me), an acceleration mechanism for visual geometric transformers without retraining or finetuning the base model. Co-Me distilled a light-weight confidence predictor to rank tokens by uncertainty and selectively merge low-confidence ones, effectively reducing computation while maintaining spatial coverage. Compared to similarity-based merging or pruning, the confidence signal in Co-Me reliably indicates regions emphasized by the transformer, enabling substantial acceleration without degrading performance. Co-Me applies seamlessly to various multi-"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"When applied to VGGT and Pi3, Co-Me achieves up to 21.5x and 20.4x speedup, making visual geometric transformers practical for real-time 3D perception and reconstruction.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That a distilled lightweight confidence predictor can reliably rank tokens by uncertainty in a manner that matches regions emphasized by the transformer, enabling substantial acceleration without degrading performance across multi-view and streaming setups.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Co-Me distills a confidence predictor to selectively merge low-confidence tokens in visual geometric transformers, delivering up to 21.5x speedup on VGGT and 20.4x on Pi3 while preserving spatial coverage and performance.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"A distilled confidence predictor ranks and merges low-uncertainty tokens to accelerate visual geometric transformers up to 21 times without retraining.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"4930bc7846b2de09a194ff8830bc0a72a1169d406c5444172e4265aad6feba26"},"source":{"id":"2511.14751","kind":"arxiv","version":2},"verdict":{"id":"7877cbb3-6a7e-4fcc-85d1-b3d1058155f4","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-17T20:32:47.511581Z","strongest_claim":"When applied to VGGT and Pi3, Co-Me achieves up to 21.5x and 20.4x speedup, making visual geometric transformers practical for real-time 3D perception and reconstruction.","one_line_summary":"Co-Me distills a confidence predictor to selectively merge low-confidence tokens in visual geometric transformers, delivering up to 21.5x speedup on VGGT and 20.4x on Pi3 while preserving spatial coverage and performance.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That a distilled lightweight confidence predictor can reliably rank tokens by uncertainty in a manner that matches regions emphasized by the transformer, enabling substantial acceleration without degrading performance across multi-view and streaming setups.","pith_extraction_headline":"A distilled confidence predictor ranks and merges low-uncertainty tokens to accelerate visual geometric transformers up to 21 times without retraining."},"references":{"count":44,"sample":[{"doi":"","year":2016,"title":"Large-scale data for multiple-view stereopsis.International Journal of Computer Vision, pages 1–16, 2016","work_id":"9704adcc-e422-4599-9c99-4af55ad87af3","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Token merging for fast sta- ble diffusion","work_id":"9cc69a3c-fb65-454e-9bce-9b4061b94cf3","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"Token merging: Your vit but faster, 2023","work_id":"2324203d-0a9b-4019-a556-d1c919dfe818","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2005,"title":"Learning to rank using gradient descent","work_id":"d887c317-abd8-498b-b55b-3518fb99e431","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2025,"title":"Must3r: Multi-view network for stereo 3d reconstruc- tion, 2025","work_id":"9dc0ed82-1ee5-4402-9ebe-6b4f3b134f3e","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":44,"snapshot_sha256":"e941ba9571ed93b244538976c77d18782039c36c97762e105e9e957403bdf1de","internal_anchors":2},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"7877cbb3-6a7e-4fcc-85d1-b3d1058155f4"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:39:00Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"scXeKg4RogqUNP9EJ+QYDStseJSzwa0Gg0doQ++u/2GnGChK1m+JoJPvB6sBdz/C5Q7wGENjyFUBeFE5MjWhCg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-28T07:36:22.565341Z"},"content_sha256":"0932995c224aab80c479a700df2165f8f9c0b9419e387c6a1ae593ba1dc36b56","schema_version":"1.0","event_id":"sha256:0932995c224aab80c479a700df2165f8f9c0b9419e387c6a1ae593ba1dc36b56"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/SCF7EQVEK6FTZY5KSXS4Z5R2SK/bundle.json","state_url":"https://pith.science/pith/SCF7EQVEK6FTZY5KSXS4Z5R2SK/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/SCF7EQVEK6FTZY5KSXS4Z5R2SK/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-28T07:36:22Z","links":{"resolver":"https://pith.science/pith/SCF7EQVEK6FTZY5KSXS4Z5R2SK","bundle":"https://pith.science/pith/SCF7EQVEK6FTZY5KSXS4Z5R2SK/bundle.json","state":"https://pith.science/pith/SCF7EQVEK6FTZY5KSXS4Z5R2SK/state.json","well_known_bundle":"https://pith.science/.well-known/pith/SCF7EQVEK6FTZY5KSXS4Z5R2SK/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2025:SCF7EQVEK6FTZY5KSXS4Z5R2SK","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"2aace5a36849de6fde91afc753105b5554fc5a4e04de5e821ea7947722357a4e","cross_cats_sorted":["cs.RO"],"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2025-11-18T18:52:22Z","title_canon_sha256":"40221f17d47a6be85094c05fdf06a5182eca98f2189ae5158aac26e397e62cba"},"schema_version":"1.0","source":{"id":"2511.14751","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2511.14751","created_at":"2026-05-17T23:39:00Z"},{"alias_kind":"arxiv_version","alias_value":"2511.14751v2","created_at":"2026-05-17T23:39:00Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2511.14751","created_at":"2026-05-17T23:39:00Z"},{"alias_kind":"pith_short_12","alias_value":"SCF7EQVEK6FT","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"SCF7EQVEK6FTZY5K","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"SCF7EQVE","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:0932995c224aab80c479a700df2165f8f9c0b9419e387c6a1ae593ba1dc36b56","target":"graph","created_at":"2026-05-17T23:39:00Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"When applied to VGGT and Pi3, Co-Me achieves up to 21.5x and 20.4x speedup, making visual geometric transformers practical for real-time 3D perception and reconstruction."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That a distilled lightweight confidence predictor can reliably rank tokens by uncertainty in a manner that matches regions emphasized by the transformer, enabling substantial acceleration without degrading performance across multi-view and streaming setups."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Co-Me distills a confidence predictor to selectively merge low-confidence tokens in visual geometric transformers, delivering up to 21.5x speedup on VGGT and 20.4x on Pi3 while preserving spatial coverage and performance."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"A distilled confidence predictor ranks and merges low-uncertainty tokens to accelerate visual geometric transformers up to 21 times without retraining."}],"snapshot_sha256":"4930bc7846b2de09a194ff8830bc0a72a1169d406c5444172e4265aad6feba26"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"We propose Confidence-Guided Token Merging (Co-Me), an acceleration mechanism for visual geometric transformers without retraining or finetuning the base model. Co-Me distilled a light-weight confidence predictor to rank tokens by uncertainty and selectively merge low-confidence ones, effectively reducing computation while maintaining spatial coverage. Compared to similarity-based merging or pruning, the confidence signal in Co-Me reliably indicates regions emphasized by the transformer, enabling substantial acceleration without degrading performance. Co-Me applies seamlessly to various multi-","authors_text":"Ali Agha, Jay Patrikar, Ruogu Li, Sebastian Scherer, Shayegan Omidshafiei, Yuheng Qiu, Yutian Chen","cross_cats":["cs.RO"],"headline":"A distilled confidence predictor ranks and merges low-uncertainty tokens to accelerate visual geometric transformers up to 21 times without retraining.","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2025-11-18T18:52:22Z","title":"Co-Me: Confidence-Guided Token Merging for Visual Geometric Transformers"},"references":{"count":44,"internal_anchors":2,"resolved_work":44,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Large-scale data for multiple-view stereopsis.International Journal of Computer Vision, pages 1–16, 2016","work_id":"9704adcc-e422-4599-9c99-4af55ad87af3","year":2016},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Token merging for fast sta- ble diffusion","work_id":"9cc69a3c-fb65-454e-9bce-9b4061b94cf3","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Token merging: Your vit but faster, 2023","work_id":"2324203d-0a9b-4019-a556-d1c919dfe818","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Learning to rank using gradient descent","work_id":"d887c317-abd8-498b-b55b-3518fb99e431","year":2005},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Must3r: Multi-view network for stereo 3d reconstruc- tion, 2025","work_id":"9dc0ed82-1ee5-4402-9ebe-6b4f3b134f3e","year":2025}],"snapshot_sha256":"e941ba9571ed93b244538976c77d18782039c36c97762e105e9e957403bdf1de"},"source":{"id":"2511.14751","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-17T20:32:47.511581Z","id":"7877cbb3-6a7e-4fcc-85d1-b3d1058155f4","model_set":{"reader":"grok-4.3"},"one_line_summary":"Co-Me distills a confidence predictor to selectively merge low-confidence tokens in visual geometric transformers, delivering up to 21.5x speedup on VGGT and 20.4x on Pi3 while preserving spatial coverage and performance.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"A distilled confidence predictor ranks and merges low-uncertainty tokens to accelerate visual geometric transformers up to 21 times without retraining.","strongest_claim":"When applied to VGGT and Pi3, Co-Me achieves up to 21.5x and 20.4x speedup, making visual geometric transformers practical for real-time 3D perception and reconstruction.","weakest_assumption":"That a distilled lightweight confidence predictor can reliably rank tokens by uncertainty in a manner that matches regions emphasized by the transformer, enabling substantial acceleration without degrading performance across multi-view and streaming setups."}},"verdict_id":"7877cbb3-6a7e-4fcc-85d1-b3d1058155f4"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:ea5ffc72f09e3d635546402eb451b5585b43fd597cc6a19de9389875e972c9f4","target":"record","created_at":"2026-05-17T23:39:00Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"2aace5a36849de6fde91afc753105b5554fc5a4e04de5e821ea7947722357a4e","cross_cats_sorted":["cs.RO"],"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2025-11-18T18:52:22Z","title_canon_sha256":"40221f17d47a6be85094c05fdf06a5182eca98f2189ae5158aac26e397e62cba"},"schema_version":"1.0","source":{"id":"2511.14751","kind":"arxiv","version":2}},"canonical_sha256":"908bf242a4578b3ce3aa95e5ccf63a92a0679976428f6443052c6f4c1669e621","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"908bf242a4578b3ce3aa95e5ccf63a92a0679976428f6443052c6f4c1669e621","first_computed_at":"2026-05-17T23:39:00.734159Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:39:00.734159Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"uYbZfS30DP5oPCvjLR7r54tOYbiX5IJagJmCBozEP4xFx7hIm/WZ0IAk50CFKBf7SiB9G+NstTPnmdHvQlXNDQ==","signature_status":"signed_v1","signed_at":"2026-05-17T23:39:00.734872Z","signed_message":"canonical_sha256_bytes"},"source_id":"2511.14751","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:ea5ffc72f09e3d635546402eb451b5585b43fd597cc6a19de9389875e972c9f4","sha256:0932995c224aab80c479a700df2165f8f9c0b9419e387c6a1ae593ba1dc36b56"],"state_sha256":"3c469ad21b38aa9b698a8b2f64d7f00ae4a060677ef074bea331d868b5574201"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"dfGu/vQhXkIbLFK6jocpT5pTlA92M53RyjH0ySATErk8+/exnfxoX7uPEBBw8GyXqZRmRkxVb+nPkYXxYdN5DQ==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-28T07:36:22.567932Z","bundle_sha256":"b5159b4f81f8e198f0ff6029450ae33fb9792b0e4dd8d68cc0cf06062660f0c4"}}