{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:JGPU576JKTEZBCLJT3SJI5ZLFT","short_pith_number":"pith:JGPU576J","schema_version":"1.0","canonical_sha256":"499f4effc954c99089699ee494772b2cddf3f197f96024b71d364255fe3a4ac3","source":{"kind":"arxiv","id":"2606.24467","version":1},"attestation_state":"computed","paper":{"title":"CompressKV: Semantic-Retrieval-Guided KV-Cache Compression for Resource-Efficient Long-Context LLM Inference","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.AI","authors_text":"Bing Li, Grace Li Zhang, Jingcun Wang, Olga Kondrateva, Xiaolin Lin, Yiyu Shi","submitted_at":"2026-06-23T11:59:46Z","abstract_excerpt":"Long-context large language model (LLM) inference is increasingly constrained by the memory footprint and decoding cost of key-value (KV) caches, limiting sustainable deployment on resource-constrained hardware. Existing KV cache eviction methods typically apply heuristic token scoring over all heads in GQA-based LLMs. These methods ignore the different functionalities of attention heads, leading to the eviction of critical tokens and thus degrading the performance of LLMs. To address this issue, we propose CompressKV, a resource-efficient KV-cache compression framework for GQA-based LLMs. Ins"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.24467","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.AI","submitted_at":"2026-06-23T11:59:46Z","cross_cats_sorted":[],"title_canon_sha256":"a657ba5d3e376fb32d12fb594c67e9b560ab7be2eb1202bbb69d8b9618fcf088","abstract_canon_sha256":"e221a68829e6e17c2cc07c25b0c11633175569cb050031f5a29ba9582acd5bae"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-24T01:15:31.193469Z","signature_b64":"kw4yVCU1d17fnbOC8mh3RvbG29mVMDBlVuKXvpL05vyr6c7vevmCbVCv9vwyl5rm/dCaUV+WmMyHO24tE9C3BQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"499f4effc954c99089699ee494772b2cddf3f197f96024b71d364255fe3a4ac3","last_reissued_at":"2026-06-24T01:15:31.193102Z","signature_status":"signed_v1","first_computed_at":"2026-06-24T01:15:31.193102Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"CompressKV: Semantic-Retrieval-Guided KV-Cache Compression for Resource-Efficient Long-Context LLM Inference","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.AI","authors_text":"Bing Li, Grace Li Zhang, Jingcun Wang, Olga Kondrateva, Xiaolin Lin, Yiyu Shi","submitted_at":"2026-06-23T11:59:46Z","abstract_excerpt":"Long-context large language model (LLM) inference is increasingly constrained by the memory footprint and decoding cost of key-value (KV) caches, limiting sustainable deployment on resource-constrained hardware. Existing KV cache eviction methods typically apply heuristic token scoring over all heads in GQA-based LLMs. These methods ignore the different functionalities of attention heads, leading to the eviction of critical tokens and thus degrading the performance of LLMs. To address this issue, we propose CompressKV, a resource-efficient KV-cache compression framework for GQA-based LLMs. Ins"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.24467","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.24467/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.24467","created_at":"2026-06-24T01:15:31.193164+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.24467v1","created_at":"2026-06-24T01:15:31.193164+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.24467","created_at":"2026-06-24T01:15:31.193164+00:00"},{"alias_kind":"pith_short_12","alias_value":"JGPU576JKTEZ","created_at":"2026-06-24T01:15:31.193164+00:00"},{"alias_kind":"pith_short_16","alias_value":"JGPU576JKTEZBCLJ","created_at":"2026-06-24T01:15:31.193164+00:00"},{"alias_kind":"pith_short_8","alias_value":"JGPU576J","created_at":"2026-06-24T01:15:31.193164+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/JGPU576JKTEZBCLJT3SJI5ZLFT","json":"https://pith.science/pith/JGPU576JKTEZBCLJT3SJI5ZLFT.json","graph_json":"https://pith.science/api/pith-number/JGPU576JKTEZBCLJT3SJI5ZLFT/graph.json","events_json":"https://pith.science/api/pith-number/JGPU576JKTEZBCLJT3SJI5ZLFT/events.json","paper":"https://pith.science/paper/JGPU576J"},"agent_actions":{"view_html":"https://pith.science/pith/JGPU576JKTEZBCLJT3SJI5ZLFT","download_json":"https://pith.science/pith/JGPU576JKTEZBCLJT3SJI5ZLFT.json","view_paper":"https://pith.science/paper/JGPU576J","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.24467&json=true","fetch_graph":"https://pith.science/api/pith-number/JGPU576JKTEZBCLJT3SJI5ZLFT/graph.json","fetch_events":"https://pith.science/api/pith-number/JGPU576JKTEZBCLJT3SJI5ZLFT/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/JGPU576JKTEZBCLJT3SJI5ZLFT/action/timestamp_anchor","attest_storage":"https://pith.science/pith/JGPU576JKTEZBCLJT3SJI5ZLFT/action/storage_attestation","attest_author":"https://pith.science/pith/JGPU576JKTEZBCLJT3SJI5ZLFT/action/author_attestation","sign_citation":"https://pith.science/pith/JGPU576JKTEZBCLJT3SJI5ZLFT/action/citation_signature","submit_replication":"https://pith.science/pith/JGPU576JKTEZBCLJT3SJI5ZLFT/action/replication_record"}},"created_at":"2026-06-24T01:15:31.193164+00:00","updated_at":"2026-06-24T01:15:31.193164+00:00"}