{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:3S7CATW2EZ53OLE3NSAIJTRCQH","short_pith_number":"pith:3S7CATW2","schema_version":"1.0","canonical_sha256":"dcbe204eda267bb72c9b6c8084ce2281f756d19349783555a6c3767588a40f05","source":{"kind":"arxiv","id":"2601.20309","version":2},"attestation_state":"computed","paper":{"title":"SuperInfer: SLO-Aware Rotary Scheduling and Memory Management for LLM Inference on Superchips","license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","headline":"","cross_cats":["cs.AI","cs.LG"],"primary_cat":"cs.DC","authors_text":"Jiahuan Yu, Mingtao Hu, Minjia Zhang, Zichao Lin","submitted_at":"2026-01-28T07:01:46Z","abstract_excerpt":"Large Language Model (LLM) serving faces a fundamental tension between stringent latency Service Level Objectives (SLOs) and limited GPU memory capacity. When high request rates exhaust the KV cache budget, existing LLM inference systems often suffer severe head-of-line (HOL) blocking. While prior work explored PCIe-based offloading, these approaches cannot sustain responsiveness under high request rates, often failing to meet tight Time-To-First-Token (TTFT) and Time-Between-Tokens (TBT) SLOs. We present SuperInfer, a high-performance LLM inference system designed for emerging Superchips (e.g"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2601.20309","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","primary_cat":"cs.DC","submitted_at":"2026-01-28T07:01:46Z","cross_cats_sorted":["cs.AI","cs.LG"],"title_canon_sha256":"95444d2bcc4a327711855c0c507cd61acd7919e7c31760459a49ceba0e1c1cb0","abstract_canon_sha256":"eb54d30f0c668eb57544e840d29ab3097b67531bc5646563bff4a72505d6e220"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T01:05:07.088971Z","signature_b64":"z7z5ZLG28Ds0ZU/PLKzzrAG9JYi6Gh3t6apkyX+TolWLW12nMx1JuTcUgJpg+Sood5tAuBA9Qo5KT+yz5y+hBg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"dcbe204eda267bb72c9b6c8084ce2281f756d19349783555a6c3767588a40f05","last_reissued_at":"2026-05-20T01:05:07.087917Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T01:05:07.087917Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"SuperInfer: SLO-Aware Rotary Scheduling and Memory Management for LLM Inference on Superchips","license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","headline":"","cross_cats":["cs.AI","cs.LG"],"primary_cat":"cs.DC","authors_text":"Jiahuan Yu, Mingtao Hu, Minjia Zhang, Zichao Lin","submitted_at":"2026-01-28T07:01:46Z","abstract_excerpt":"Large Language Model (LLM) serving faces a fundamental tension between stringent latency Service Level Objectives (SLOs) and limited GPU memory capacity. When high request rates exhaust the KV cache budget, existing LLM inference systems often suffer severe head-of-line (HOL) blocking. While prior work explored PCIe-based offloading, these approaches cannot sustain responsiveness under high request rates, often failing to meet tight Time-To-First-Token (TTFT) and Time-Between-Tokens (TBT) SLOs. We present SuperInfer, a high-performance LLM inference system designed for emerging Superchips (e.g"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2601.20309","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2601.20309/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2601.20309","created_at":"2026-05-20T01:05:07.088071+00:00"},{"alias_kind":"arxiv_version","alias_value":"2601.20309v2","created_at":"2026-05-20T01:05:07.088071+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2601.20309","created_at":"2026-05-20T01:05:07.088071+00:00"},{"alias_kind":"pith_short_12","alias_value":"3S7CATW2EZ53","created_at":"2026-05-20T01:05:07.088071+00:00"},{"alias_kind":"pith_short_16","alias_value":"3S7CATW2EZ53OLE3","created_at":"2026-05-20T01:05:07.088071+00:00"},{"alias_kind":"pith_short_8","alias_value":"3S7CATW2","created_at":"2026-05-20T01:05:07.088071+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":1,"internal_anchor_count":1,"sample":[{"citing_arxiv_id":"2605.16867","citing_title":"GoodServe: Towards High-Goodput Serving of Agentic LLM Inferences over Heterogeneous Resources","ref_index":37,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/3S7CATW2EZ53OLE3NSAIJTRCQH","json":"https://pith.science/pith/3S7CATW2EZ53OLE3NSAIJTRCQH.json","graph_json":"https://pith.science/api/pith-number/3S7CATW2EZ53OLE3NSAIJTRCQH/graph.json","events_json":"https://pith.science/api/pith-number/3S7CATW2EZ53OLE3NSAIJTRCQH/events.json","paper":"https://pith.science/paper/3S7CATW2"},"agent_actions":{"view_html":"https://pith.science/pith/3S7CATW2EZ53OLE3NSAIJTRCQH","download_json":"https://pith.science/pith/3S7CATW2EZ53OLE3NSAIJTRCQH.json","view_paper":"https://pith.science/paper/3S7CATW2","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2601.20309&json=true","fetch_graph":"https://pith.science/api/pith-number/3S7CATW2EZ53OLE3NSAIJTRCQH/graph.json","fetch_events":"https://pith.science/api/pith-number/3S7CATW2EZ53OLE3NSAIJTRCQH/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/3S7CATW2EZ53OLE3NSAIJTRCQH/action/timestamp_anchor","attest_storage":"https://pith.science/pith/3S7CATW2EZ53OLE3NSAIJTRCQH/action/storage_attestation","attest_author":"https://pith.science/pith/3S7CATW2EZ53OLE3NSAIJTRCQH/action/author_attestation","sign_citation":"https://pith.science/pith/3S7CATW2EZ53OLE3NSAIJTRCQH/action/citation_signature","submit_replication":"https://pith.science/pith/3S7CATW2EZ53OLE3NSAIJTRCQH/action/replication_record"}},"created_at":"2026-05-20T01:05:07.088071+00:00","updated_at":"2026-05-20T01:05:07.088071+00:00"}