{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2024:BWIAXPTYFCWBSFU2474RYSMEMR","short_pith_number":"pith:BWIAXPTY","canonical_record":{"source":{"id":"2406.10774","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-06-16T01:33:02Z","cross_cats_sorted":["cs.LG"],"title_canon_sha256":"d3fe8b74b07aadd7ad629906ecb00045739da53c44bcccc04a230ee451b8b27b","abstract_canon_sha256":"f1e057c54ef5bad63ddb13e05dfd763685b60792f4462cce20f8292321c5e5bb"},"schema_version":"1.0"},"canonical_sha256":"0d900bbe7828ac19169ae7f91c49846449e053ca56781d809c9ea37308fe874f","source":{"kind":"arxiv","id":"2406.10774","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2406.10774","created_at":"2026-05-17T23:38:52Z"},{"alias_kind":"arxiv_version","alias_value":"2406.10774v2","created_at":"2026-05-17T23:38:52Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2406.10774","created_at":"2026-05-17T23:38:52Z"},{"alias_kind":"pith_short_12","alias_value":"BWIAXPTYFCWB","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"BWIAXPTYFCWBSFU2","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"BWIAXPTY","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2024:BWIAXPTYFCWBSFU2474RYSMEMR","target":"record","payload":{"canonical_record":{"source":{"id":"2406.10774","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-06-16T01:33:02Z","cross_cats_sorted":["cs.LG"],"title_canon_sha256":"d3fe8b74b07aadd7ad629906ecb00045739da53c44bcccc04a230ee451b8b27b","abstract_canon_sha256":"f1e057c54ef5bad63ddb13e05dfd763685b60792f4462cce20f8292321c5e5bb"},"schema_version":"1.0"},"canonical_sha256":"0d900bbe7828ac19169ae7f91c49846449e053ca56781d809c9ea37308fe874f","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:52.376057Z","signature_b64":"BwFxHeAIP7DRZH0tdrVZsaLkoq4zUH2mnfTEhLEOJGrNheGlsSGiN8wndHv+VgiKj/9tWwF8IZiPnDc/RNmsDA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"0d900bbe7828ac19169ae7f91c49846449e053ca56781d809c9ea37308fe874f","last_reissued_at":"2026-05-17T23:38:52.375607Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:52.375607Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2406.10774","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:52Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"T1d7OWPSsHYT3erOmG04TMlSgBTwlp7evAMosNyb+UNwfxeoVATWCJQGQGMB/OG5oTh2jwznQjebhDcoB1bwCg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-06T02:42:14.406325Z"},"content_sha256":"c04ec6e98b1226e7d3bdc8e1986d1c02c415daff7dd2347ab6a4cc0f8585ed01","schema_version":"1.0","event_id":"sha256:c04ec6e98b1226e7d3bdc8e1986d1c02c415daff7dd2347ab6a4cc0f8585ed01"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2024:BWIAXPTYFCWBSFU2474RYSMEMR","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Quest: Query-Aware Sparsity for Efficient Long-Context LLM Inference","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"Quest selects only the top-K critical KV cache pages using query vectors and min-max key bounds to accelerate long-context LLM attention.","cross_cats":["cs.LG"],"primary_cat":"cs.CL","authors_text":"Baris Kasikci, Guangxuan Xiao, Jiaming Tang, Kan Zhu, Song Han, Yilong Zhao","submitted_at":"2024-06-16T01:33:02Z","abstract_excerpt":"As the demand for long-context large language models (LLMs) increases, models with context windows of up to 128K or 1M tokens are becoming increasingly prevalent. However, long-context LLM inference is challenging since the inference speed decreases significantly as the sequence length grows. This slowdown is primarily caused by loading a large KV cache during self-attention. Previous works have shown that a small portion of critical tokens will dominate the attention outcomes. However, we observe the criticality of a token highly depends on the query. To this end, we propose Quest, a query-aw"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"By only loading the Top-K critical KV cache pages for attention, Quest significantly speeds up self-attention without sacrificing accuracy. We show that Quest can achieve up to 2.23x self-attention speedup, which reduces inference latency by 7.03x while performing well on tasks with long dependencies with negligible accuracy loss.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the min/max key approximation per page, combined with query-vector scoring, reliably identifies the truly critical pages without dropping information that would change the final attention output on long-dependency tasks.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Quest speeds up long-context LLM self-attention by up to 2.23x via query-dependent selection of top-K critical KV cache pages, cutting overall latency by 7.03x with negligible accuracy loss.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Quest selects only the top-K critical KV cache pages using query vectors and min-max key bounds to accelerate long-context LLM attention.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"e17bf82a2fb63efa58f980c030e59b23bb06db738800a83ea02d5e926b469b52"},"source":{"id":"2406.10774","kind":"arxiv","version":2},"verdict":{"id":"8f5d9e8d-d7dd-47a6-91d0-2b9b803af69f","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T14:04:56.497780Z","strongest_claim":"By only loading the Top-K critical KV cache pages for attention, Quest significantly speeds up self-attention without sacrificing accuracy. We show that Quest can achieve up to 2.23x self-attention speedup, which reduces inference latency by 7.03x while performing well on tasks with long dependencies with negligible accuracy loss.","one_line_summary":"Quest speeds up long-context LLM self-attention by up to 2.23x via query-dependent selection of top-K critical KV cache pages, cutting overall latency by 7.03x with negligible accuracy loss.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the min/max key approximation per page, combined with query-vector scoring, reliably identifies the truly critical pages without dropping information that would change the final attention output on long-dependency tasks.","pith_extraction_headline":"Quest selects only the top-K critical KV cache pages using query vectors and min-max key bounds to accelerate long-context LLM attention."},"references":{"count":72,"sample":[{"doi":"","year":2024,"title":"I ntroducing the next generation of C laude","work_id":"c7fa93db-1d56-45f0-9974-a574f30b2ab1","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"Longbench: A bilingual, multitask benchmark for long context understanding, 2023","work_id":"f3203315-6a3d-43f9-bcf6-e6e35491b6d8","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2022,"title":"Y., Ermon, S., Rudra, A., and Ré, C","work_id":"1a200b80-d0ee-4c53-b99a-b3acee1d6777","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2021,"title":"A., and Gardner, M","work_id":"5c114c15-2f4d-40c1-be92-e0aa233c38f2","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"Model tells you what to discard: Adaptive kv cache compression for llms, 2024","work_id":"f3be5c3d-0cd9-483d-b8d7-8f79555b0294","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":72,"snapshot_sha256":"f73df1206bea4501fe32d4ec39db4fdd570bf4d28259bc5ec322c1b40005c80d","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"8f5d9e8d-d7dd-47a6-91d0-2b9b803af69f"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:52Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"T1vB+RAUPxVlAuSm0AzZBjhX9OYtkvV2VcRG6IGAsrwt6AR3xv1W13DcFL91lh6EYhIjgewExyGZwzak9eJgCg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-06T02:42:14.407295Z"},"content_sha256":"c0f1f83b3c42494387f45d8dde7a997e6dbacf0b2c659806288b1fcd2d3033e8","schema_version":"1.0","event_id":"sha256:c0f1f83b3c42494387f45d8dde7a997e6dbacf0b2c659806288b1fcd2d3033e8"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/BWIAXPTYFCWBSFU2474RYSMEMR/bundle.json","state_url":"https://pith.science/pith/BWIAXPTYFCWBSFU2474RYSMEMR/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/BWIAXPTYFCWBSFU2474RYSMEMR/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-06T02:42:14Z","links":{"resolver":"https://pith.science/pith/BWIAXPTYFCWBSFU2474RYSMEMR","bundle":"https://pith.science/pith/BWIAXPTYFCWBSFU2474RYSMEMR/bundle.json","state":"https://pith.science/pith/BWIAXPTYFCWBSFU2474RYSMEMR/state.json","well_known_bundle":"https://pith.science/.well-known/pith/BWIAXPTYFCWBSFU2474RYSMEMR/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2024:BWIAXPTYFCWBSFU2474RYSMEMR","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"f1e057c54ef5bad63ddb13e05dfd763685b60792f4462cce20f8292321c5e5bb","cross_cats_sorted":["cs.LG"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-06-16T01:33:02Z","title_canon_sha256":"d3fe8b74b07aadd7ad629906ecb00045739da53c44bcccc04a230ee451b8b27b"},"schema_version":"1.0","source":{"id":"2406.10774","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2406.10774","created_at":"2026-05-17T23:38:52Z"},{"alias_kind":"arxiv_version","alias_value":"2406.10774v2","created_at":"2026-05-17T23:38:52Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2406.10774","created_at":"2026-05-17T23:38:52Z"},{"alias_kind":"pith_short_12","alias_value":"BWIAXPTYFCWB","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"BWIAXPTYFCWBSFU2","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"BWIAXPTY","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:c0f1f83b3c42494387f45d8dde7a997e6dbacf0b2c659806288b1fcd2d3033e8","target":"graph","created_at":"2026-05-17T23:38:52Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"By only loading the Top-K critical KV cache pages for attention, Quest significantly speeds up self-attention without sacrificing accuracy. We show that Quest can achieve up to 2.23x self-attention speedup, which reduces inference latency by 7.03x while performing well on tasks with long dependencies with negligible accuracy loss."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the min/max key approximation per page, combined with query-vector scoring, reliably identifies the truly critical pages without dropping information that would change the final attention output on long-dependency tasks."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Quest speeds up long-context LLM self-attention by up to 2.23x via query-dependent selection of top-K critical KV cache pages, cutting overall latency by 7.03x with negligible accuracy loss."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Quest selects only the top-K critical KV cache pages using query vectors and min-max key bounds to accelerate long-context LLM attention."}],"snapshot_sha256":"e17bf82a2fb63efa58f980c030e59b23bb06db738800a83ea02d5e926b469b52"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"As the demand for long-context large language models (LLMs) increases, models with context windows of up to 128K or 1M tokens are becoming increasingly prevalent. However, long-context LLM inference is challenging since the inference speed decreases significantly as the sequence length grows. This slowdown is primarily caused by loading a large KV cache during self-attention. Previous works have shown that a small portion of critical tokens will dominate the attention outcomes. However, we observe the criticality of a token highly depends on the query. To this end, we propose Quest, a query-aw","authors_text":"Baris Kasikci, Guangxuan Xiao, Jiaming Tang, Kan Zhu, Song Han, Yilong Zhao","cross_cats":["cs.LG"],"headline":"Quest selects only the top-K critical KV cache pages using query vectors and min-max key bounds to accelerate long-context LLM attention.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-06-16T01:33:02Z","title":"Quest: Query-Aware Sparsity for Efficient Long-Context LLM Inference"},"references":{"count":72,"internal_anchors":0,"resolved_work":72,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"I ntroducing the next generation of C laude","work_id":"c7fa93db-1d56-45f0-9974-a574f30b2ab1","year":2024},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Longbench: A bilingual, multitask benchmark for long context understanding, 2023","work_id":"f3203315-6a3d-43f9-bcf6-e6e35491b6d8","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Y., Ermon, S., Rudra, A., and Ré, C","work_id":"1a200b80-d0ee-4c53-b99a-b3acee1d6777","year":2022},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"A., and Gardner, M","work_id":"5c114c15-2f4d-40c1-be92-e0aa233c38f2","year":2021},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Model tells you what to discard: Adaptive kv cache compression for llms, 2024","work_id":"f3be5c3d-0cd9-483d-b8d7-8f79555b0294","year":2024}],"snapshot_sha256":"f73df1206bea4501fe32d4ec39db4fdd570bf4d28259bc5ec322c1b40005c80d"},"source":{"id":"2406.10774","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-15T14:04:56.497780Z","id":"8f5d9e8d-d7dd-47a6-91d0-2b9b803af69f","model_set":{"reader":"grok-4.3"},"one_line_summary":"Quest speeds up long-context LLM self-attention by up to 2.23x via query-dependent selection of top-K critical KV cache pages, cutting overall latency by 7.03x with negligible accuracy loss.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Quest selects only the top-K critical KV cache pages using query vectors and min-max key bounds to accelerate long-context LLM attention.","strongest_claim":"By only loading the Top-K critical KV cache pages for attention, Quest significantly speeds up self-attention without sacrificing accuracy. We show that Quest can achieve up to 2.23x self-attention speedup, which reduces inference latency by 7.03x while performing well on tasks with long dependencies with negligible accuracy loss.","weakest_assumption":"That the min/max key approximation per page, combined with query-vector scoring, reliably identifies the truly critical pages without dropping information that would change the final attention output on long-dependency tasks."}},"verdict_id":"8f5d9e8d-d7dd-47a6-91d0-2b9b803af69f"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:c04ec6e98b1226e7d3bdc8e1986d1c02c415daff7dd2347ab6a4cc0f8585ed01","target":"record","created_at":"2026-05-17T23:38:52Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"f1e057c54ef5bad63ddb13e05dfd763685b60792f4462cce20f8292321c5e5bb","cross_cats_sorted":["cs.LG"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-06-16T01:33:02Z","title_canon_sha256":"d3fe8b74b07aadd7ad629906ecb00045739da53c44bcccc04a230ee451b8b27b"},"schema_version":"1.0","source":{"id":"2406.10774","kind":"arxiv","version":2}},"canonical_sha256":"0d900bbe7828ac19169ae7f91c49846449e053ca56781d809c9ea37308fe874f","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"0d900bbe7828ac19169ae7f91c49846449e053ca56781d809c9ea37308fe874f","first_computed_at":"2026-05-17T23:38:52.375607Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:52.375607Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"BwFxHeAIP7DRZH0tdrVZsaLkoq4zUH2mnfTEhLEOJGrNheGlsSGiN8wndHv+VgiKj/9tWwF8IZiPnDc/RNmsDA==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:52.376057Z","signed_message":"canonical_sha256_bytes"},"source_id":"2406.10774","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:c04ec6e98b1226e7d3bdc8e1986d1c02c415daff7dd2347ab6a4cc0f8585ed01","sha256:c0f1f83b3c42494387f45d8dde7a997e6dbacf0b2c659806288b1fcd2d3033e8"],"state_sha256":"d94b3fef9fa292c03fdbcd401a9dfbdf31675489be87edbd278f9f59de9d0ab0"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"nVvBbAhJGu9YbtCMUsrqeR7INJgT9aOvPRMGUQXuncPh7uucOa1MOY4g3ArRftd6nOne+bSNlUG9w5zNI5BLCQ==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-06T02:42:14.411430Z","bundle_sha256":"d69d7d30175163e8df3074a7302387aa5081c5e9d4c6cc2a833edbad19d0e712"}}