{"work":{"id":"2cad64c9-e2d5-42b6-8db9-03fafde4bcb0","openalex_id":null,"doi":null,"arxiv_id":"2406.10774","raw_key":null,"title":"Quest: Query-Aware Sparsity for Efficient Long-Context LLM Inference","authors":null,"authors_text":"Jiaming Tang, Yilong Zhao, Kan Zhu, Guangxuan Xiao, Baris Kasikci, Song Han","year":2024,"venue":"cs.CL","abstract":"As the demand for long-context large language models (LLMs) increases, models with context windows of up to 128K or 1M tokens are becoming increasingly prevalent. However, long-context LLM inference is challenging since the inference speed decreases significantly as the sequence length grows. This slowdown is primarily caused by loading a large KV cache during self-attention. Previous works have shown that a small portion of critical tokens will dominate the attention outcomes. However, we observe the criticality of a token highly depends on the query. To this end, we propose Quest, a query-aware KV cache selection algorithm. Quest keeps track of the minimal and maximal Key values in KV cache pages and estimates the criticality of a given page using Query vectors. By only loading the Top-K critical KV cache pages for attention, Quest significantly speeds up self-attention without sacrificing accuracy. We show that Quest can achieve up to 2.23x self-attention speedup, which reduces inference latency by 7.03x while performing well on tasks with long dependencies with negligible accuracy loss. Code is available at http://github.com/mit-han-lab/Quest .","external_url":"https://arxiv.org/abs/2406.10774","cited_by_count":null,"metadata_source":"pith","metadata_fetched_at":"2026-05-23T04:17:31.050538+00:00","pith_arxiv_id":"2406.10774","created_at":"2026-05-09T06:15:37.590081+00:00","updated_at":"2026-05-23T04:17:31.050538+00:00","title_quality_ok":true,"display_title":"Quest: Query-Aware Sparsity for Efficient Long-Context LLM Inference","render_title":"Quest: Query-Aware Sparsity for Efficient Long-Context LLM Inference"},"hub":{"state":{"work_id":"2cad64c9-e2d5-42b6-8db9-03fafde4bcb0","tier":"hub","tier_reason":"10+ Pith inbound or 1,000+ external citations","pith_inbound_count":42,"external_cited_by_count":null,"distinct_field_count":10,"first_pith_cited_at":"2025-02-04T02:23:06+00:00","last_pith_cited_at":"2026-05-21T16:55:04+00:00","author_build_status":"not_needed","summary_status":"needed","contexts_status":"needed","graph_status":"needed","ask_index_status":"not_needed","reader_status":"not_needed","recognition_status":"not_needed","updated_at":"2026-05-28T17:38:49.609670+00:00","tier_text":"hub"},"tier":"hub","role_counts":[{"context_role":"background","n":4},{"context_role":"baseline","n":1},{"context_role":"method","n":1}],"polarity_counts":[{"context_polarity":"background","n":4},{"context_polarity":"baseline","n":1},{"context_polarity":"use_method","n":1}],"runs":{},"summary":{},"graph":{},"authors":[]}}