{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:XROX5EOT5WQCIPFTJLROAAURZ2","short_pith_number":"pith:XROX5EOT","schema_version":"1.0","canonical_sha256":"bc5d7e91d3eda0243cb34ae2e00291cea20fb500dc93be8f2f1595d78aa30524","source":{"kind":"arxiv","id":"2604.11530","version":2},"attestation_state":"computed","paper":{"title":"Beyond Attention Scores: SVD-Based Vision Token Pruning for Efficient Vision-Language Models","license":"http://creativecommons.org/licenses/by/4.0/","headline":"SVD-Prune selects vision tokens via leverage scores from singular value decomposition to preserve essential content at extreme pruning ratios.","cross_cats":["cs.AI"],"primary_cat":"cs.CV","authors_text":"Martyna Poreba, Michal Szczepanski, Samia Bouchafa, Yvon Apedo","submitted_at":"2026-04-13T14:30:13Z","abstract_excerpt":"Vision-Language Models (VLMs) have revolutionized multi-modal learning by jointly processing visual and textual information. Yet, they face significant challenges due to the high computational and memory demands of processing long sequences of vision tokens. Many existing methods rely on local heuristics, such as attention scores or token norms. However, these criteria suffer from positional bias and information dispersion, limiting their ability to preserve essential content at high pruning ratios and leading to performance degradation on visually detailed images. To address these issues, we "},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2604.11530","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-04-13T14:30:13Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"efcc9677468db9fa61e151da4a2b53c419b8d78dc88d7028ad43c345e5dfb938","abstract_canon_sha256":"98a7237cfec8c992eac5674f0dc418922b6b0e8ea5d831366f9a6f0d1365992d"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-21T01:05:18.966034Z","signature_b64":"hg1KDkmLAkH8Jp8qDncv9YiFrWJJ7flh3ng2a4/9vyeJTg3w3CAgkRUvQIfuggyabY+C7hTetvNaMjC5Uh3ZBg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"bc5d7e91d3eda0243cb34ae2e00291cea20fb500dc93be8f2f1595d78aa30524","last_reissued_at":"2026-05-21T01:05:18.965457Z","signature_status":"signed_v1","first_computed_at":"2026-05-21T01:05:18.965457Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Beyond Attention Scores: SVD-Based Vision Token Pruning for Efficient Vision-Language Models","license":"http://creativecommons.org/licenses/by/4.0/","headline":"SVD-Prune selects vision tokens via leverage scores from singular value decomposition to preserve essential content at extreme pruning ratios.","cross_cats":["cs.AI"],"primary_cat":"cs.CV","authors_text":"Martyna Poreba, Michal Szczepanski, Samia Bouchafa, Yvon Apedo","submitted_at":"2026-04-13T14:30:13Z","abstract_excerpt":"Vision-Language Models (VLMs) have revolutionized multi-modal learning by jointly processing visual and textual information. Yet, they face significant challenges due to the high computational and memory demands of processing long sequences of vision tokens. Many existing methods rely on local heuristics, such as attention scores or token norms. However, these criteria suffer from positional bias and information dispersion, limiting their ability to preserve essential content at high pruning ratios and leading to performance degradation on visually detailed images. To address these issues, we "},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Experiments show that SVD-Prune consistently outperforms prior pruning methods under extreme vision token budgets, maintaining strong performance even with 32 and 16 vision tokens.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That statistical leverage scores derived from the SVD of the vision token feature matrix reliably identify tokens containing essential visual content without introducing positional bias or losing fine details on complex images.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"SVD-Prune selects vision tokens via SVD leverage scores to keep performance high even when pruning to only 16-32 tokens.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"SVD-Prune selects vision tokens via leverage scores from singular value decomposition to preserve essential content at extreme pruning ratios.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"20b277ba6f928192cc126936d2d7bfff06ffd9fbe3d0348631af099c0a39bccb"},"source":{"id":"2604.11530","kind":"arxiv","version":2},"verdict":{"id":"13f0c14c-2d8a-4d3b-89d4-29594c21721b","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-10T16:22:57.618953Z","strongest_claim":"Experiments show that SVD-Prune consistently outperforms prior pruning methods under extreme vision token budgets, maintaining strong performance even with 32 and 16 vision tokens.","one_line_summary":"SVD-Prune selects vision tokens via SVD leverage scores to keep performance high even when pruning to only 16-32 tokens.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That statistical leverage scores derived from the SVD of the vision token feature matrix reliably identify tokens containing essential visual content without introducing positional bias or losing fine details on complex images.","pith_extraction_headline":"SVD-Prune selects vision tokens via leverage scores from singular value decomposition to preserve essential content at extreme pruning ratios."},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2604.11530/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2604.11530","created_at":"2026-05-21T01:05:18.965522+00:00"},{"alias_kind":"arxiv_version","alias_value":"2604.11530v2","created_at":"2026-05-21T01:05:18.965522+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2604.11530","created_at":"2026-05-21T01:05:18.965522+00:00"},{"alias_kind":"pith_short_12","alias_value":"XROX5EOT5WQC","created_at":"2026-05-21T01:05:18.965522+00:00"},{"alias_kind":"pith_short_16","alias_value":"XROX5EOT5WQCIPFT","created_at":"2026-05-21T01:05:18.965522+00:00"},{"alias_kind":"pith_short_8","alias_value":"XROX5EOT","created_at":"2026-05-21T01:05:18.965522+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":2,"internal_anchor_count":2,"sample":[{"citing_arxiv_id":"2604.11530","citing_title":"Beyond Attention Scores: SVD-Based Vision Token Pruning for Efficient Vision-Language Models","ref_index":2,"is_internal_anchor":true},{"citing_arxiv_id":"2604.11530","citing_title":"Beyond Attention Scores: SVD-Based Vision Token Pruning for Efficient Vision-Language Models","ref_index":2,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/XROX5EOT5WQCIPFTJLROAAURZ2","json":"https://pith.science/pith/XROX5EOT5WQCIPFTJLROAAURZ2.json","graph_json":"https://pith.science/api/pith-number/XROX5EOT5WQCIPFTJLROAAURZ2/graph.json","events_json":"https://pith.science/api/pith-number/XROX5EOT5WQCIPFTJLROAAURZ2/events.json","paper":"https://pith.science/paper/XROX5EOT"},"agent_actions":{"view_html":"https://pith.science/pith/XROX5EOT5WQCIPFTJLROAAURZ2","download_json":"https://pith.science/pith/XROX5EOT5WQCIPFTJLROAAURZ2.json","view_paper":"https://pith.science/paper/XROX5EOT","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2604.11530&json=true","fetch_graph":"https://pith.science/api/pith-number/XROX5EOT5WQCIPFTJLROAAURZ2/graph.json","fetch_events":"https://pith.science/api/pith-number/XROX5EOT5WQCIPFTJLROAAURZ2/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/XROX5EOT5WQCIPFTJLROAAURZ2/action/timestamp_anchor","attest_storage":"https://pith.science/pith/XROX5EOT5WQCIPFTJLROAAURZ2/action/storage_attestation","attest_author":"https://pith.science/pith/XROX5EOT5WQCIPFTJLROAAURZ2/action/author_attestation","sign_citation":"https://pith.science/pith/XROX5EOT5WQCIPFTJLROAAURZ2/action/citation_signature","submit_replication":"https://pith.science/pith/XROX5EOT5WQCIPFTJLROAAURZ2/action/replication_record"}},"created_at":"2026-05-21T01:05:18.965522+00:00","updated_at":"2026-05-21T01:05:18.965522+00:00"}