{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:SCF7EQVEK6FTZY5KSXS4Z5R2SK","short_pith_number":"pith:SCF7EQVE","schema_version":"1.0","canonical_sha256":"908bf242a4578b3ce3aa95e5ccf63a92a0679976428f6443052c6f4c1669e621","source":{"kind":"arxiv","id":"2511.14751","version":2},"attestation_state":"computed","paper":{"title":"Co-Me: Confidence-Guided Token Merging for Visual Geometric Transformers","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","headline":"A distilled confidence predictor ranks and merges low-uncertainty tokens to accelerate visual geometric transformers up to 21 times without retraining.","cross_cats":["cs.RO"],"primary_cat":"cs.CV","authors_text":"Ali Agha, Jay Patrikar, Ruogu Li, Sebastian Scherer, Shayegan Omidshafiei, Yuheng Qiu, Yutian Chen","submitted_at":"2025-11-18T18:52:22Z","abstract_excerpt":"We propose Confidence-Guided Token Merging (Co-Me), an acceleration mechanism for visual geometric transformers without retraining or finetuning the base model. Co-Me distilled a light-weight confidence predictor to rank tokens by uncertainty and selectively merge low-confidence ones, effectively reducing computation while maintaining spatial coverage. Compared to similarity-based merging or pruning, the confidence signal in Co-Me reliably indicates regions emphasized by the transformer, enabling substantial acceleration without degrading performance. Co-Me applies seamlessly to various multi-"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":true,"formal_links_present":false},"canonical_record":{"source":{"id":"2511.14751","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2025-11-18T18:52:22Z","cross_cats_sorted":["cs.RO"],"title_canon_sha256":"40221f17d47a6be85094c05fdf06a5182eca98f2189ae5158aac26e397e62cba","abstract_canon_sha256":"2aace5a36849de6fde91afc753105b5554fc5a4e04de5e821ea7947722357a4e"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:39:00.734872Z","signature_b64":"uYbZfS30DP5oPCvjLR7r54tOYbiX5IJagJmCBozEP4xFx7hIm/WZ0IAk50CFKBf7SiB9G+NstTPnmdHvQlXNDQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"908bf242a4578b3ce3aa95e5ccf63a92a0679976428f6443052c6f4c1669e621","last_reissued_at":"2026-05-17T23:39:00.734159Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:39:00.734159Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Co-Me: Confidence-Guided Token Merging for Visual Geometric Transformers","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","headline":"A distilled confidence predictor ranks and merges low-uncertainty tokens to accelerate visual geometric transformers up to 21 times without retraining.","cross_cats":["cs.RO"],"primary_cat":"cs.CV","authors_text":"Ali Agha, Jay Patrikar, Ruogu Li, Sebastian Scherer, Shayegan Omidshafiei, Yuheng Qiu, Yutian Chen","submitted_at":"2025-11-18T18:52:22Z","abstract_excerpt":"We propose Confidence-Guided Token Merging (Co-Me), an acceleration mechanism for visual geometric transformers without retraining or finetuning the base model. Co-Me distilled a light-weight confidence predictor to rank tokens by uncertainty and selectively merge low-confidence ones, effectively reducing computation while maintaining spatial coverage. Compared to similarity-based merging or pruning, the confidence signal in Co-Me reliably indicates regions emphasized by the transformer, enabling substantial acceleration without degrading performance. Co-Me applies seamlessly to various multi-"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"When applied to VGGT and Pi3, Co-Me achieves up to 21.5x and 20.4x speedup, making visual geometric transformers practical for real-time 3D perception and reconstruction.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That a distilled lightweight confidence predictor can reliably rank tokens by uncertainty in a manner that matches regions emphasized by the transformer, enabling substantial acceleration without degrading performance across multi-view and streaming setups.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Co-Me distills a confidence predictor to selectively merge low-confidence tokens in visual geometric transformers, delivering up to 21.5x speedup on VGGT and 20.4x on Pi3 while preserving spatial coverage and performance.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"A distilled confidence predictor ranks and merges low-uncertainty tokens to accelerate visual geometric transformers up to 21 times without retraining.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"4930bc7846b2de09a194ff8830bc0a72a1169d406c5444172e4265aad6feba26"},"source":{"id":"2511.14751","kind":"arxiv","version":2},"verdict":{"id":"7877cbb3-6a7e-4fcc-85d1-b3d1058155f4","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-17T20:32:47.511581Z","strongest_claim":"When applied to VGGT and Pi3, Co-Me achieves up to 21.5x and 20.4x speedup, making visual geometric transformers practical for real-time 3D perception and reconstruction.","one_line_summary":"Co-Me distills a confidence predictor to selectively merge low-confidence tokens in visual geometric transformers, delivering up to 21.5x speedup on VGGT and 20.4x on Pi3 while preserving spatial coverage and performance.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That a distilled lightweight confidence predictor can reliably rank tokens by uncertainty in a manner that matches regions emphasized by the transformer, enabling substantial acceleration without degrading performance across multi-view and streaming setups.","pith_extraction_headline":"A distilled confidence predictor ranks and merges low-uncertainty tokens to accelerate visual geometric transformers up to 21 times without retraining."},"references":{"count":44,"sample":[{"doi":"","year":2016,"title":"Large-scale data for multiple-view stereopsis.International Journal of Computer Vision, pages 1–16, 2016","work_id":"9704adcc-e422-4599-9c99-4af55ad87af3","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Token merging for fast sta- ble diffusion","work_id":"9cc69a3c-fb65-454e-9bce-9b4061b94cf3","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"Token merging: Your vit but faster, 2023","work_id":"2324203d-0a9b-4019-a556-d1c919dfe818","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2005,"title":"Learning to rank using gradient descent","work_id":"d887c317-abd8-498b-b55b-3518fb99e431","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2025,"title":"Must3r: Multi-view network for stereo 3d reconstruc- tion, 2025","work_id":"9dc0ed82-1ee5-4402-9ebe-6b4f3b134f3e","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":44,"snapshot_sha256":"e941ba9571ed93b244538976c77d18782039c36c97762e105e9e957403bdf1de","internal_anchors":2},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2511.14751","created_at":"2026-05-17T23:39:00.734295+00:00"},{"alias_kind":"arxiv_version","alias_value":"2511.14751v2","created_at":"2026-05-17T23:39:00.734295+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2511.14751","created_at":"2026-05-17T23:39:00.734295+00:00"},{"alias_kind":"pith_short_12","alias_value":"SCF7EQVEK6FT","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_16","alias_value":"SCF7EQVEK6FTZY5K","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_8","alias_value":"SCF7EQVE","created_at":"2026-05-18T12:33:37.589309+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/SCF7EQVEK6FTZY5KSXS4Z5R2SK","json":"https://pith.science/pith/SCF7EQVEK6FTZY5KSXS4Z5R2SK.json","graph_json":"https://pith.science/api/pith-number/SCF7EQVEK6FTZY5KSXS4Z5R2SK/graph.json","events_json":"https://pith.science/api/pith-number/SCF7EQVEK6FTZY5KSXS4Z5R2SK/events.json","paper":"https://pith.science/paper/SCF7EQVE"},"agent_actions":{"view_html":"https://pith.science/pith/SCF7EQVEK6FTZY5KSXS4Z5R2SK","download_json":"https://pith.science/pith/SCF7EQVEK6FTZY5KSXS4Z5R2SK.json","view_paper":"https://pith.science/paper/SCF7EQVE","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2511.14751&json=true","fetch_graph":"https://pith.science/api/pith-number/SCF7EQVEK6FTZY5KSXS4Z5R2SK/graph.json","fetch_events":"https://pith.science/api/pith-number/SCF7EQVEK6FTZY5KSXS4Z5R2SK/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/SCF7EQVEK6FTZY5KSXS4Z5R2SK/action/timestamp_anchor","attest_storage":"https://pith.science/pith/SCF7EQVEK6FTZY5KSXS4Z5R2SK/action/storage_attestation","attest_author":"https://pith.science/pith/SCF7EQVEK6FTZY5KSXS4Z5R2SK/action/author_attestation","sign_citation":"https://pith.science/pith/SCF7EQVEK6FTZY5KSXS4Z5R2SK/action/citation_signature","submit_replication":"https://pith.science/pith/SCF7EQVEK6FTZY5KSXS4Z5R2SK/action/replication_record"}},"created_at":"2026-05-17T23:39:00.734295+00:00","updated_at":"2026-05-17T23:39:00.734295+00:00"}