{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:XG4YV6DYIGMNHPUDZTSAQ4NYN6","short_pith_number":"pith:XG4YV6DY","schema_version":"1.0","canonical_sha256":"b9b98af8784198d3be83cce40871b86f97d10b795b0a551c795c8e3556b7273c","source":{"kind":"arxiv","id":"2605.13734","version":1},"attestation_state":"computed","paper":{"title":"KVServe: Service-Aware KV Cache Compression for Communication-Efficient Disaggregated LLM Serving","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"KVServe uses service-aware adaptive KV cache compression to cut latency bottlenecks in disaggregated LLM serving","cross_cats":["cs.AI","cs.NI"],"primary_cat":"cs.DC","authors_text":"Bing Lu, Dejun Luo, Dingwen Tao, Guangming Tan, Hairui Zhao, Jinyang Liu, Wenjing Huang, Xingchen Liu, Xinyang Ma, Yida Gu, Zedong Liu, Zheng Wei","submitted_at":"2026-05-13T16:12:33Z","abstract_excerpt":"LLMs are widely adopted in production, pushing inference systems to their limits. Disaggregated LLM serving (e.g., PD separation and KV state disaggregation) improves scalability and cost efficiency, but it also turns KV into an explicit payload crossing network and storage boundaries, making KV a dominant end-to-end bottleneck. Existing KV compression are typically static runtime configurations, despite production service context varies over time in workload mix, bandwidth, and SLO/quality budgets. As a result, a fixed choice can be suboptimal or even increase latency. We present \\emph{KVServ"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":true,"formal_links_present":false},"canonical_record":{"source":{"id":"2605.13734","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.DC","submitted_at":"2026-05-13T16:12:33Z","cross_cats_sorted":["cs.AI","cs.NI"],"title_canon_sha256":"abfac4b257da8df3eb18636150d96c40a12516311e307c3c4d79a7228842affa","abstract_canon_sha256":"374391d0768aa528347fcdcd4f33b048d216928c6e5bbaf646f47adf9121ae1a"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T02:44:16.531101Z","signature_b64":"OF5OxOIbVRH5ZHC0kMvjbGg0O3/OS87rmde9LXdgTQSpEjtI1LB3lJ6NUQw8FUuo5fKnnuF8xB3PoaZ1GN4WBg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"b9b98af8784198d3be83cce40871b86f97d10b795b0a551c795c8e3556b7273c","last_reissued_at":"2026-05-18T02:44:16.530580Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T02:44:16.530580Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"KVServe: Service-Aware KV Cache Compression for Communication-Efficient Disaggregated LLM Serving","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"KVServe uses service-aware adaptive KV cache compression to cut latency bottlenecks in disaggregated LLM serving","cross_cats":["cs.AI","cs.NI"],"primary_cat":"cs.DC","authors_text":"Bing Lu, Dejun Luo, Dingwen Tao, Guangming Tan, Hairui Zhao, Jinyang Liu, Wenjing Huang, Xingchen Liu, Xinyang Ma, Yida Gu, Zedong Liu, Zheng Wei","submitted_at":"2026-05-13T16:12:33Z","abstract_excerpt":"LLMs are widely adopted in production, pushing inference systems to their limits. Disaggregated LLM serving (e.g., PD separation and KV state disaggregation) improves scalability and cost efficiency, but it also turns KV into an explicit payload crossing network and storage boundaries, making KV a dominant end-to-end bottleneck. Existing KV compression are typically static runtime configurations, despite production service context varies over time in workload mix, bandwidth, and SLO/quality budgets. As a result, a fixed choice can be suboptimal or even increase latency. We present \\emph{KVServ"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"KVServe achieves up to 9.13× JCT speedup in PD-separated serving and up to 32.8× TTFT reduction in KV-disaggregated serving through its service-aware adaptive compression framework.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The analytical latency model combined with the bandit controller reliably selects profiles that match real-world performance despite offline-to-online mismatch, without introducing unacceptable quality degradation under varying SLO budgets.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"KVServe delivers up to 9.13x job completion time speedup and 32.8x time-to-first-token reduction by making KV cache compression service-aware and adaptive in disaggregated LLM serving.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"KVServe uses service-aware adaptive KV cache compression to cut latency bottlenecks in disaggregated LLM serving","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"e577c974a2cee2dff94cf580c73ccb1142d973c53a4478a0566213598ab2fb1a"},"source":{"id":"2605.13734","kind":"arxiv","version":1},"verdict":{"id":"6b9b5f83-3e67-4363-9809-b951a09171cb","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-14T17:39:05.257613Z","strongest_claim":"KVServe achieves up to 9.13× JCT speedup in PD-separated serving and up to 32.8× TTFT reduction in KV-disaggregated serving through its service-aware adaptive compression framework.","one_line_summary":"KVServe delivers up to 9.13x job completion time speedup and 32.8x time-to-first-token reduction by making KV cache compression service-aware and adaptive in disaggregated LLM serving.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The analytical latency model combined with the bandit controller reliably selects profiles that match real-world performance despite offline-to-online mismatch, without introducing unacceptable quality degradation under varying SLO budgets.","pith_extraction_headline":"KVServe uses service-aware adaptive KV cache compression to cut latency bottlenecks in disaggregated LLM serving"},"references":{"count":55,"sample":[{"doi":"","year":2026,"title":"Amazon Web Services. 2026. Amazon EC2 FAQs. https://aws.amazon. com/ec2/faqs/. (2026). Accessed: 2026-01-29","work_id":"47e8a433-3786-4c18-a0cd-75ea4c255f32","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"Muhammad Arslan, Hussam Ghanem, Saba Munawar, and Christophe Cruz. 2024. A Survey on RAG with LLMs.Procedia computer science 246 (2024), 3781–3790","work_id":"ba8c509e-296c-48ed-9798-73b659a2cb72","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"Saleh Ashkboos, Amirkeivan Mohtashami, Maximilian L Croci, Bo Li, Pashmina Cameron, Martin Jaggi, Dan Alistarh, Torsten Hoefler, and James Hensman. 2024. Quarot: Outlier-free 4-bit inference in rotate","work_id":"acae88f5-85b4-4438-bb2d-b092bb7c64e9","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"LongBench: A Bilingual, Multitask Benchmark for Long Context Understanding","work_id":"ba7831c4-9427-4e0e-a5c1-4e98511f4b53","ref_index":4,"cited_arxiv_id":"2308.14508","is_internal_anchor":true},{"doi":"","year":2021,"title":"Evaluating Large Language Models Trained on Code","work_id":"042493e9-b26f-4b4e-bbde-382072ca9b08","ref_index":5,"cited_arxiv_id":"2107.03374","is_internal_anchor":true}],"resolved_work":55,"snapshot_sha256":"672f03169300c43fad175bc1e262148f2b266556742d526dd529fdc8772342bc","internal_anchors":8},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2605.13734","created_at":"2026-05-18T02:44:16.530681+00:00"},{"alias_kind":"arxiv_version","alias_value":"2605.13734v1","created_at":"2026-05-18T02:44:16.530681+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.13734","created_at":"2026-05-18T02:44:16.530681+00:00"},{"alias_kind":"pith_short_12","alias_value":"XG4YV6DYIGMN","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_16","alias_value":"XG4YV6DYIGMNHPUD","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_8","alias_value":"XG4YV6DY","created_at":"2026-05-18T12:33:37.589309+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/XG4YV6DYIGMNHPUDZTSAQ4NYN6","json":"https://pith.science/pith/XG4YV6DYIGMNHPUDZTSAQ4NYN6.json","graph_json":"https://pith.science/api/pith-number/XG4YV6DYIGMNHPUDZTSAQ4NYN6/graph.json","events_json":"https://pith.science/api/pith-number/XG4YV6DYIGMNHPUDZTSAQ4NYN6/events.json","paper":"https://pith.science/paper/XG4YV6DY"},"agent_actions":{"view_html":"https://pith.science/pith/XG4YV6DYIGMNHPUDZTSAQ4NYN6","download_json":"https://pith.science/pith/XG4YV6DYIGMNHPUDZTSAQ4NYN6.json","view_paper":"https://pith.science/paper/XG4YV6DY","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2605.13734&json=true","fetch_graph":"https://pith.science/api/pith-number/XG4YV6DYIGMNHPUDZTSAQ4NYN6/graph.json","fetch_events":"https://pith.science/api/pith-number/XG4YV6DYIGMNHPUDZTSAQ4NYN6/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/XG4YV6DYIGMNHPUDZTSAQ4NYN6/action/timestamp_anchor","attest_storage":"https://pith.science/pith/XG4YV6DYIGMNHPUDZTSAQ4NYN6/action/storage_attestation","attest_author":"https://pith.science/pith/XG4YV6DYIGMNHPUDZTSAQ4NYN6/action/author_attestation","sign_citation":"https://pith.science/pith/XG4YV6DYIGMNHPUDZTSAQ4NYN6/action/citation_signature","submit_replication":"https://pith.science/pith/XG4YV6DYIGMNHPUDZTSAQ4NYN6/action/replication_record"}},"created_at":"2026-05-18T02:44:16.530681+00:00","updated_at":"2026-05-18T02:44:16.530681+00:00"}