{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:7NFM733K67POPV6KFZSVDDHNHS","short_pith_number":"pith:7NFM733K","schema_version":"1.0","canonical_sha256":"fb4acfef6af7dee7d7ca2e65518ced3cbbf7028efbfc68cfcef66329f00bead7","source":{"kind":"arxiv","id":"2602.23200","version":2},"attestation_state":"computed","paper":{"title":"InnerQ: Hardware-Aware Tuning-Free Quantization of KV Cache for Large Language Models","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.CL"],"primary_cat":"cs.LG","authors_text":"Amir Ardakani, Sayed Mohammadreza Tayaranian Hosseini, Warren J. Gross","submitted_at":"2026-02-26T16:50:36Z","abstract_excerpt":"When transformer-based language models are deployed for text generation, most of the inference time is spent in the decoding stage, where output tokens are generated sequentially. Reducing the hardware cost of each decoding step is therefore critical for efficient long-context generation. A major bottleneck is the key-value (KV) cache, whose size grows with sequence length and often dominates the model's memory footprint. Prior work has proposed quantization methods to compress the KV cache while minimizing its loss of precision. We present InnerQ, a hardware-aware KV cache quantization scheme"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2602.23200","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2026-02-26T16:50:36Z","cross_cats_sorted":["cs.CL"],"title_canon_sha256":"bf071579f7edbe026a49975900bdd82cc6928c775277f70bb0a5250b7cae76ae","abstract_canon_sha256":"6dbd8fcb493ca7480b46baad824149ef4622096e507476ac9ee6cddd2dfce16f"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-22T01:03:18.604612Z","signature_b64":"KnE4qe53cLTJdovuXxzVLW8qXR2iPe+Ju6xCd6nv/rkCqedvQsNLMll9KeJsSI9/eSl6qgWkjGopmXapNXeyBQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"fb4acfef6af7dee7d7ca2e65518ced3cbbf7028efbfc68cfcef66329f00bead7","last_reissued_at":"2026-05-22T01:03:18.603750Z","signature_status":"signed_v1","first_computed_at":"2026-05-22T01:03:18.603750Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"InnerQ: Hardware-Aware Tuning-Free Quantization of KV Cache for Large Language Models","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.CL"],"primary_cat":"cs.LG","authors_text":"Amir Ardakani, Sayed Mohammadreza Tayaranian Hosseini, Warren J. Gross","submitted_at":"2026-02-26T16:50:36Z","abstract_excerpt":"When transformer-based language models are deployed for text generation, most of the inference time is spent in the decoding stage, where output tokens are generated sequentially. Reducing the hardware cost of each decoding step is therefore critical for efficient long-context generation. A major bottleneck is the key-value (KV) cache, whose size grows with sequence length and often dominates the model's memory footprint. Prior work has proposed quantization methods to compress the KV cache while minimizing its loss of precision. We present InnerQ, a hardware-aware KV cache quantization scheme"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2602.23200","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2602.23200/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2602.23200","created_at":"2026-05-22T01:03:18.603853+00:00"},{"alias_kind":"arxiv_version","alias_value":"2602.23200v2","created_at":"2026-05-22T01:03:18.603853+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2602.23200","created_at":"2026-05-22T01:03:18.603853+00:00"},{"alias_kind":"pith_short_12","alias_value":"7NFM733K67PO","created_at":"2026-05-22T01:03:18.603853+00:00"},{"alias_kind":"pith_short_16","alias_value":"7NFM733K67POPV6K","created_at":"2026-05-22T01:03:18.603853+00:00"},{"alias_kind":"pith_short_8","alias_value":"7NFM733K","created_at":"2026-05-22T01:03:18.603853+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":2,"internal_anchor_count":2,"sample":[{"citing_arxiv_id":"2605.20868","citing_title":"Runtime-Certified Bounded-Error Quantized Attention","ref_index":4,"is_internal_anchor":true},{"citing_arxiv_id":"2605.17887","citing_title":"Attention Sinks and Outliers in Attention Residuals","ref_index":6,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/7NFM733K67POPV6KFZSVDDHNHS","json":"https://pith.science/pith/7NFM733K67POPV6KFZSVDDHNHS.json","graph_json":"https://pith.science/api/pith-number/7NFM733K67POPV6KFZSVDDHNHS/graph.json","events_json":"https://pith.science/api/pith-number/7NFM733K67POPV6KFZSVDDHNHS/events.json","paper":"https://pith.science/paper/7NFM733K"},"agent_actions":{"view_html":"https://pith.science/pith/7NFM733K67POPV6KFZSVDDHNHS","download_json":"https://pith.science/pith/7NFM733K67POPV6KFZSVDDHNHS.json","view_paper":"https://pith.science/paper/7NFM733K","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2602.23200&json=true","fetch_graph":"https://pith.science/api/pith-number/7NFM733K67POPV6KFZSVDDHNHS/graph.json","fetch_events":"https://pith.science/api/pith-number/7NFM733K67POPV6KFZSVDDHNHS/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/7NFM733K67POPV6KFZSVDDHNHS/action/timestamp_anchor","attest_storage":"https://pith.science/pith/7NFM733K67POPV6KFZSVDDHNHS/action/storage_attestation","attest_author":"https://pith.science/pith/7NFM733K67POPV6KFZSVDDHNHS/action/author_attestation","sign_citation":"https://pith.science/pith/7NFM733K67POPV6KFZSVDDHNHS/action/citation_signature","submit_replication":"https://pith.science/pith/7NFM733K67POPV6KFZSVDDHNHS/action/replication_record"}},"created_at":"2026-05-22T01:03:18.603853+00:00","updated_at":"2026-05-22T01:03:18.603853+00:00"}