{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:BLQDQJEISWHP334A43JMKP6TMH","short_pith_number":"pith:BLQDQJEI","canonical_record":{"source":{"id":"2602.04657","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-02-04T15:33:10Z","cross_cats_sorted":[],"title_canon_sha256":"9ec91120c352d9e8cf72dcdd2ab0f3419da85bff1dcbf048010ee28f72d3ec88","abstract_canon_sha256":"274d100576cf7ef06b3118ef63ceb4875086938a2e4827733a3a3a6e1290afd5"},"schema_version":"1.0"},"canonical_sha256":"0ae0382488958efdef80e6d2c53fd361f5351c15ead05b1e0d2bfc760240441d","source":{"kind":"arxiv","id":"2602.04657","version":3},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2602.04657","created_at":"2026-05-17T23:39:16Z"},{"alias_kind":"arxiv_version","alias_value":"2602.04657v3","created_at":"2026-05-17T23:39:16Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2602.04657","created_at":"2026-05-17T23:39:16Z"},{"alias_kind":"pith_short_12","alias_value":"BLQDQJEISWHP","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"BLQDQJEISWHP334A","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"BLQDQJEI","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:BLQDQJEISWHP334A43JMKP6TMH","target":"record","payload":{"canonical_record":{"source":{"id":"2602.04657","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-02-04T15:33:10Z","cross_cats_sorted":[],"title_canon_sha256":"9ec91120c352d9e8cf72dcdd2ab0f3419da85bff1dcbf048010ee28f72d3ec88","abstract_canon_sha256":"274d100576cf7ef06b3118ef63ceb4875086938a2e4827733a3a3a6e1290afd5"},"schema_version":"1.0"},"canonical_sha256":"0ae0382488958efdef80e6d2c53fd361f5351c15ead05b1e0d2bfc760240441d","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:39:16.348839Z","signature_b64":"mOZYunNxaCz3T16GcFmVhPw/NllDtrjy9PoliPgXchUGxGzpECjesTLPkrqSkYT5/m744toPnhLIXhZwyKasDQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"0ae0382488958efdef80e6d2c53fd361f5351c15ead05b1e0d2bfc760240441d","last_reissued_at":"2026-05-17T23:39:16.348136Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:39:16.348136Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2602.04657","source_version":3,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:39:16Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"eGvPgG6x5LyQ/DnlJIqHcu/uJeuTcm7JE7uzAIZl4qL9hrtnMlimHsm374OloDLfDMuMo/VwN+AI1b/9PRB4CQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-07-02T09:59:01.102581Z"},"content_sha256":"e773bc7c4ff35b20ddd7d92b8419c2e07fb62d3044082d4955690526895f8552","schema_version":"1.0","event_id":"sha256:e773bc7c4ff35b20ddd7d92b8419c2e07fb62d3044082d4955690526895f8552"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:BLQDQJEISWHP334A43JMKP6TMH","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"TRIO: Token Reduction via Inference-Objective Guidance for Efficient Vision-Language Models","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"TRIO reduces visual tokens in vision-language models to 11 percent while retaining 97 percent performance by selecting tokens whose removal leaves the final output unchanged.","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Congyang Ou, Dawei Yan, Haokui Zhang, Peng Wang, Qingsen Yan, Rong Xiao, Ying Li, Yu Zhang","submitted_at":"2026-02-04T15:33:10Z","abstract_excerpt":"Recently, reducing redundant visual tokens in vision-language models (VLMs) to accelerate VLM inference has emerged as a hot topic. However, most existing methods rely on heuristics constructed based on inter-visual-token similarity or cross-modal visual-text similarity, which gives rise to certain limitations in compression performance and practical deployment. In contrast, we propose TRIO from the perspective of inference objectives, which transforms visual token compression into preserving output result invariance and selects tokens primarily by their importance to this goal. Specifically, "},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"On LLaVA-Next-7B, TRIO retains just 11.1% of visual tokens but maintains 97.2% of the original performance, with a 2.75× prefill speedup, 2.14× inference speedup, 6.22× lower FLOPs, and 6.05× reduced KV Cache overhead.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the designed layer-local proxy loss produces token-level gradient saliency that reliably identifies tokens whose removal leaves the final output essentially unchanged, without requiring full end-to-end gradients or task-specific tuning.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"TRIO keeps 97.2% performance on LLaVA-Next-7B using only 11.1% visual tokens, yielding 2.75x prefill speedup and 6x lower FLOPs via inference-objective gradient guidance.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"TRIO reduces visual tokens in vision-language models to 11 percent while retaining 97 percent performance by selecting tokens whose removal leaves the final output unchanged.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"b6987d672ca0113a476f0d9489906d44b123b6df893a7c7a0d750c21381d41d4"},"source":{"id":"2602.04657","kind":"arxiv","version":3},"verdict":{"id":"dc21b94d-e47f-4db3-a844-b27f49e2a308","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-16T07:24:57.792012Z","strongest_claim":"On LLaVA-Next-7B, TRIO retains just 11.1% of visual tokens but maintains 97.2% of the original performance, with a 2.75× prefill speedup, 2.14× inference speedup, 6.22× lower FLOPs, and 6.05× reduced KV Cache overhead.","one_line_summary":"TRIO keeps 97.2% performance on LLaVA-Next-7B using only 11.1% visual tokens, yielding 2.75x prefill speedup and 6x lower FLOPs via inference-objective gradient guidance.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the designed layer-local proxy loss produces token-level gradient saliency that reliably identifies tokens whose removal leaves the final output essentially unchanged, without requiring full end-to-end gradients or task-specific tuning.","pith_extraction_headline":"TRIO reduces visual tokens in vision-language models to 11 percent while retaining 97 percent performance by selecting tokens whose removal leaves the final output unchanged."},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"dc21b94d-e47f-4db3-a844-b27f49e2a308"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:39:16Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"olZT+66OzGZU2i1fm5ifNLxptGqWdFon3LkXVP07Yod3A4hNZJBN/4Sq4SWuAI2aRHIJe1HTy8SoOujNmSToCw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-07-02T09:59:01.103018Z"},"content_sha256":"dad78331721aa6fd329adca65b79fce1b384d638a26d5ccb02242872d93197cf","schema_version":"1.0","event_id":"sha256:dad78331721aa6fd329adca65b79fce1b384d638a26d5ccb02242872d93197cf"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/BLQDQJEISWHP334A43JMKP6TMH/bundle.json","state_url":"https://pith.science/pith/BLQDQJEISWHP334A43JMKP6TMH/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/BLQDQJEISWHP334A43JMKP6TMH/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-07-02T09:59:01Z","links":{"resolver":"https://pith.science/pith/BLQDQJEISWHP334A43JMKP6TMH","bundle":"https://pith.science/pith/BLQDQJEISWHP334A43JMKP6TMH/bundle.json","state":"https://pith.science/pith/BLQDQJEISWHP334A43JMKP6TMH/state.json","well_known_bundle":"https://pith.science/.well-known/pith/BLQDQJEISWHP334A43JMKP6TMH/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:BLQDQJEISWHP334A43JMKP6TMH","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"274d100576cf7ef06b3118ef63ceb4875086938a2e4827733a3a3a6e1290afd5","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-02-04T15:33:10Z","title_canon_sha256":"9ec91120c352d9e8cf72dcdd2ab0f3419da85bff1dcbf048010ee28f72d3ec88"},"schema_version":"1.0","source":{"id":"2602.04657","kind":"arxiv","version":3}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2602.04657","created_at":"2026-05-17T23:39:16Z"},{"alias_kind":"arxiv_version","alias_value":"2602.04657v3","created_at":"2026-05-17T23:39:16Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2602.04657","created_at":"2026-05-17T23:39:16Z"},{"alias_kind":"pith_short_12","alias_value":"BLQDQJEISWHP","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"BLQDQJEISWHP334A","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"BLQDQJEI","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:dad78331721aa6fd329adca65b79fce1b384d638a26d5ccb02242872d93197cf","target":"graph","created_at":"2026-05-17T23:39:16Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"On LLaVA-Next-7B, TRIO retains just 11.1% of visual tokens but maintains 97.2% of the original performance, with a 2.75× prefill speedup, 2.14× inference speedup, 6.22× lower FLOPs, and 6.05× reduced KV Cache overhead."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the designed layer-local proxy loss produces token-level gradient saliency that reliably identifies tokens whose removal leaves the final output essentially unchanged, without requiring full end-to-end gradients or task-specific tuning."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"TRIO keeps 97.2% performance on LLaVA-Next-7B using only 11.1% visual tokens, yielding 2.75x prefill speedup and 6x lower FLOPs via inference-objective gradient guidance."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"TRIO reduces visual tokens in vision-language models to 11 percent while retaining 97 percent performance by selecting tokens whose removal leaves the final output unchanged."}],"snapshot_sha256":"b6987d672ca0113a476f0d9489906d44b123b6df893a7c7a0d750c21381d41d4"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"Recently, reducing redundant visual tokens in vision-language models (VLMs) to accelerate VLM inference has emerged as a hot topic. However, most existing methods rely on heuristics constructed based on inter-visual-token similarity or cross-modal visual-text similarity, which gives rise to certain limitations in compression performance and practical deployment. In contrast, we propose TRIO from the perspective of inference objectives, which transforms visual token compression into preserving output result invariance and selects tokens primarily by their importance to this goal. Specifically, ","authors_text":"Congyang Ou, Dawei Yan, Haokui Zhang, Peng Wang, Qingsen Yan, Rong Xiao, Ying Li, Yu Zhang","cross_cats":[],"headline":"TRIO reduces visual tokens in vision-language models to 11 percent while retaining 97 percent performance by selecting tokens whose removal leaves the final output unchanged.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-02-04T15:33:10Z","title":"TRIO: Token Reduction via Inference-Objective Guidance for Efficient Vision-Language Models"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2602.04657","kind":"arxiv","version":3},"verdict":{"created_at":"2026-05-16T07:24:57.792012Z","id":"dc21b94d-e47f-4db3-a844-b27f49e2a308","model_set":{"reader":"grok-4.3"},"one_line_summary":"TRIO keeps 97.2% performance on LLaVA-Next-7B using only 11.1% visual tokens, yielding 2.75x prefill speedup and 6x lower FLOPs via inference-objective gradient guidance.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"TRIO reduces visual tokens in vision-language models to 11 percent while retaining 97 percent performance by selecting tokens whose removal leaves the final output unchanged.","strongest_claim":"On LLaVA-Next-7B, TRIO retains just 11.1% of visual tokens but maintains 97.2% of the original performance, with a 2.75× prefill speedup, 2.14× inference speedup, 6.22× lower FLOPs, and 6.05× reduced KV Cache overhead.","weakest_assumption":"That the designed layer-local proxy loss produces token-level gradient saliency that reliably identifies tokens whose removal leaves the final output essentially unchanged, without requiring full end-to-end gradients or task-specific tuning."}},"verdict_id":"dc21b94d-e47f-4db3-a844-b27f49e2a308"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:e773bc7c4ff35b20ddd7d92b8419c2e07fb62d3044082d4955690526895f8552","target":"record","created_at":"2026-05-17T23:39:16Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"274d100576cf7ef06b3118ef63ceb4875086938a2e4827733a3a3a6e1290afd5","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-02-04T15:33:10Z","title_canon_sha256":"9ec91120c352d9e8cf72dcdd2ab0f3419da85bff1dcbf048010ee28f72d3ec88"},"schema_version":"1.0","source":{"id":"2602.04657","kind":"arxiv","version":3}},"canonical_sha256":"0ae0382488958efdef80e6d2c53fd361f5351c15ead05b1e0d2bfc760240441d","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"0ae0382488958efdef80e6d2c53fd361f5351c15ead05b1e0d2bfc760240441d","first_computed_at":"2026-05-17T23:39:16.348136Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:39:16.348136Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"mOZYunNxaCz3T16GcFmVhPw/NllDtrjy9PoliPgXchUGxGzpECjesTLPkrqSkYT5/m744toPnhLIXhZwyKasDQ==","signature_status":"signed_v1","signed_at":"2026-05-17T23:39:16.348839Z","signed_message":"canonical_sha256_bytes"},"source_id":"2602.04657","source_kind":"arxiv","source_version":3}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:e773bc7c4ff35b20ddd7d92b8419c2e07fb62d3044082d4955690526895f8552","sha256:dad78331721aa6fd329adca65b79fce1b384d638a26d5ccb02242872d93197cf"],"state_sha256":"7b5f57647224c36bffc23dda8428b0efd280fb2170191955ac08ea25f1694a1d"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"pAqYmoSMQfIZGWHxg/qTO9VBQ43uqrGJMviFcdpsRvB/CAl+k6fVMrNiMuIbbDFmughML/pk8mmK+fpz3R0XDw==","signed_message":"bundle_sha256_bytes","signed_at":"2026-07-02T09:59:01.105264Z","bundle_sha256":"72342f392fad50411a3f281eb06f669e77bb6f2493799fd1ad9c5e2180db7aae"}}