{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:K4CIF6ELOWVEHIM7A4BOYW5EGV","short_pith_number":"pith:K4CIF6EL","schema_version":"1.0","canonical_sha256":"570482f88b75aa43a19f0702ec5ba435602d63f91baae81575c8c22a8e57762a","source":{"kind":"arxiv","id":"2604.05718","version":1},"attestation_state":"computed","paper":{"title":"MPM: Mutual Pair Merging for Efficient Vision Transformers","license":"http://creativecommons.org/licenses/by-sa/4.0/","headline":"Mutual Pair Merging shortens vision transformer sequences for semantic segmentation by averaging mutual nearest-neighbor token pairs while preserving reconstruction for existing decoders.","cross_cats":[],"primary_cat":"cs.CV","authors_text":"David Rousseau, Pejman Rasti, Simon Rav\\'e","submitted_at":"2026-04-07T11:16:18Z","abstract_excerpt":"Decreasing sequence length is a common way to accelerate transformers, but prior token reduction work often targets classification and reports proxy metrics rather than end-to-end latency. For semantic segmentation, token reduction is further constrained by the need to reconstruct dense, pixel-aligned features, and on modern accelerators the overhead of computing merge maps can erase expected gains. We propose Mutual Pair Merging (MPM), a training-free token aggregation module that forms mutual nearest-neighbor pairs in cosine space, averages each pair, and records a merge map enabling a gathe"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":true,"formal_links_present":true},"canonical_record":{"source":{"id":"2604.05718","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2026-04-07T11:16:18Z","cross_cats_sorted":[],"title_canon_sha256":"9cae031688c14722c93202bc3e868fec19939cee50f2666bf964b449622204af","abstract_canon_sha256":"7968cca217d129e3bae8b2cd1b9e35f83ac114439fddf5cb883732628f028cbd"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-03T01:05:50.229286Z","signature_b64":"XdvlgYfELA22Kt9HRahgeRc9D8xN2VhHtpmu3dezCv5Ag1MRAo//4KtLW9lJ+RsikpvWmbcUdW0WwoRMxSIXCQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"570482f88b75aa43a19f0702ec5ba435602d63f91baae81575c8c22a8e57762a","last_reissued_at":"2026-06-03T01:05:50.228775Z","signature_status":"signed_v1","first_computed_at":"2026-06-03T01:05:50.228775Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"MPM: Mutual Pair Merging for Efficient Vision Transformers","license":"http://creativecommons.org/licenses/by-sa/4.0/","headline":"Mutual Pair Merging shortens vision transformer sequences for semantic segmentation by averaging mutual nearest-neighbor token pairs while preserving reconstruction for existing decoders.","cross_cats":[],"primary_cat":"cs.CV","authors_text":"David Rousseau, Pejman Rasti, Simon Rav\\'e","submitted_at":"2026-04-07T11:16:18Z","abstract_excerpt":"Decreasing sequence length is a common way to accelerate transformers, but prior token reduction work often targets classification and reports proxy metrics rather than end-to-end latency. For semantic segmentation, token reduction is further constrained by the need to reconstruct dense, pixel-aligned features, and on modern accelerators the overhead of computing merge maps can erase expected gains. We propose Mutual Pair Merging (MPM), a training-free token aggregation module that forms mutual nearest-neighbor pairs in cosine space, averages each pair, and records a merge map enabling a gathe"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"On ADE20K, MPM reduces per-image latency by up to 60% for ViT-Tiny on Raspberry Pi 5, and increases throughput by up to 20% on H100 with FlashAttention-2 while keeping the mIoU drop below 3%.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the overhead of computing mutual nearest-neighbor pairs and the subsequent gather-based reconstruction remains small enough on the target hardware to produce net latency gains, and that the merge map allows existing segmentation heads to be used unchanged without further accuracy degradation.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"MPM merges mutual nearest-neighbor token pairs in cosine space for ViTs, records a merge map for reconstruction, and delivers up to 60% latency reduction on Raspberry Pi 5 and 20% throughput gain on H100 with under 3% mIoU drop on ADE20K.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Mutual Pair Merging shortens vision transformer sequences for semantic segmentation by averaging mutual nearest-neighbor token pairs while preserving reconstruction for existing decoders.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"b9123a0d45c688e543570f60af56a24c3eb0d36173bab93f059781c17c9af326"},"source":{"id":"2604.05718","kind":"arxiv","version":1},"verdict":{"id":"65fa1ae2-182a-4a2d-8feb-8aa2703411c6","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-10T19:18:10.526578Z","strongest_claim":"On ADE20K, MPM reduces per-image latency by up to 60% for ViT-Tiny on Raspberry Pi 5, and increases throughput by up to 20% on H100 with FlashAttention-2 while keeping the mIoU drop below 3%.","one_line_summary":"MPM merges mutual nearest-neighbor token pairs in cosine space for ViTs, records a merge map for reconstruction, and delivers up to 60% latency reduction on Raspberry Pi 5 and 20% throughput gain on H100 with under 3% mIoU drop on ADE20K.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the overhead of computing mutual nearest-neighbor pairs and the subsequent gather-based reconstruction remains small enough on the target hardware to produce net latency gains, and that the merge map allows existing segmentation heads to be used unchanged without further accuracy degradation.","pith_extraction_headline":"Mutual Pair Merging shortens vision transformer sequences for semantic segmentation by averaging mutual nearest-neighbor token pairs while preserving reconstruction for existing decoders."},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2604.05718/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":39,"sample":[{"doi":"","year":2025,"title":"Token cropr: Faster vits for quite a few tasks","work_id":"8f3bdbf4-0bf1-4d98-a5ef-e39bafe76528","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"Token merging: Your vit but faster","work_id":"e9ce772c-28c2-47ad-a3ac-a4bea2bb446c","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"Vision transformer adapter for dense predictions","work_id":"0cf83e0f-7c9b-4167-a57f-3af8337fc67b","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2022,"title":"Schwing, Alexan- der Kirillov, and Rohit Girdhar","work_id":"3687301f-1f89-440a-b46a-062ac769085d","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2022,"title":"IEEE, 2022. 1, 2, 7, 8","work_id":"890f0026-bc66-4ec9-a38a-20723435b72f","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":39,"snapshot_sha256":"d24d11fc5e3d548db8830b6a63ca6f1257cbe97c09cef804e3587145b8f7054d","internal_anchors":0},"formal_canon":{"evidence_count":2,"snapshot_sha256":"4c35980618a6145420ee648cc4af0395d2edf29d00359de86c7a9ad4093a1793"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2604.05718","created_at":"2026-06-03T01:05:50.228856+00:00"},{"alias_kind":"arxiv_version","alias_value":"2604.05718v1","created_at":"2026-06-03T01:05:50.228856+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2604.05718","created_at":"2026-06-03T01:05:50.228856+00:00"},{"alias_kind":"pith_short_12","alias_value":"K4CIF6ELOWVE","created_at":"2026-06-03T01:05:50.228856+00:00"},{"alias_kind":"pith_short_16","alias_value":"K4CIF6ELOWVEHIM7","created_at":"2026-06-03T01:05:50.228856+00:00"},{"alias_kind":"pith_short_8","alias_value":"K4CIF6EL","created_at":"2026-06-03T01:05:50.228856+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":2,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/K4CIF6ELOWVEHIM7A4BOYW5EGV","json":"https://pith.science/pith/K4CIF6ELOWVEHIM7A4BOYW5EGV.json","graph_json":"https://pith.science/api/pith-number/K4CIF6ELOWVEHIM7A4BOYW5EGV/graph.json","events_json":"https://pith.science/api/pith-number/K4CIF6ELOWVEHIM7A4BOYW5EGV/events.json","paper":"https://pith.science/paper/K4CIF6EL"},"agent_actions":{"view_html":"https://pith.science/pith/K4CIF6ELOWVEHIM7A4BOYW5EGV","download_json":"https://pith.science/pith/K4CIF6ELOWVEHIM7A4BOYW5EGV.json","view_paper":"https://pith.science/paper/K4CIF6EL","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2604.05718&json=true","fetch_graph":"https://pith.science/api/pith-number/K4CIF6ELOWVEHIM7A4BOYW5EGV/graph.json","fetch_events":"https://pith.science/api/pith-number/K4CIF6ELOWVEHIM7A4BOYW5EGV/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/K4CIF6ELOWVEHIM7A4BOYW5EGV/action/timestamp_anchor","attest_storage":"https://pith.science/pith/K4CIF6ELOWVEHIM7A4BOYW5EGV/action/storage_attestation","attest_author":"https://pith.science/pith/K4CIF6ELOWVEHIM7A4BOYW5EGV/action/author_attestation","sign_citation":"https://pith.science/pith/K4CIF6ELOWVEHIM7A4BOYW5EGV/action/citation_signature","submit_replication":"https://pith.science/pith/K4CIF6ELOWVEHIM7A4BOYW5EGV/action/replication_record"}},"created_at":"2026-06-03T01:05:50.228856+00:00","updated_at":"2026-06-03T01:05:50.228856+00:00"}