{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2023:R5I3O3BKJRI5W7L5DDVIWJLSCQ","short_pith_number":"pith:R5I3O3BK","canonical_record":{"source":{"id":"2305.05920","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2023-05-10T06:17:50Z","cross_cats_sorted":["cs.DC"],"title_canon_sha256":"11d47b641c181c272ea0ee2eff1e59d151b2e50a675352094028c329f8712803","abstract_canon_sha256":"78cdf56d7fa54d739f556b8432c47f660915962d0d52e492c2ac1e70b807618a"},"schema_version":"1.0"},"canonical_sha256":"8f51b76c2a4c51db7d7d18ea8b25721415869cc95e7906d90d5fba833ac4d882","source":{"kind":"arxiv","id":"2305.05920","version":3},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2305.05920","created_at":"2026-05-17T23:38:14Z"},{"alias_kind":"arxiv_version","alias_value":"2305.05920v3","created_at":"2026-05-17T23:38:14Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2305.05920","created_at":"2026-05-17T23:38:14Z"},{"alias_kind":"pith_short_12","alias_value":"R5I3O3BKJRI5","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"R5I3O3BKJRI5W7L5","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"R5I3O3BK","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2023:R5I3O3BKJRI5W7L5DDVIWJLSCQ","target":"record","payload":{"canonical_record":{"source":{"id":"2305.05920","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2023-05-10T06:17:50Z","cross_cats_sorted":["cs.DC"],"title_canon_sha256":"11d47b641c181c272ea0ee2eff1e59d151b2e50a675352094028c329f8712803","abstract_canon_sha256":"78cdf56d7fa54d739f556b8432c47f660915962d0d52e492c2ac1e70b807618a"},"schema_version":"1.0"},"canonical_sha256":"8f51b76c2a4c51db7d7d18ea8b25721415869cc95e7906d90d5fba833ac4d882","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:14.250276Z","signature_b64":"Wb3Q11wq+G1cSbuW2tZXGn+Y43OnnNqDxorg7DSmWCe9Uapm2FC57gCaMg+7iXqpW54u1MMZwp45VnRtSY3tDw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"8f51b76c2a4c51db7d7d18ea8b25721415869cc95e7906d90d5fba833ac4d882","last_reissued_at":"2026-05-17T23:38:14.249636Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:14.249636Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2305.05920","source_version":3,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:14Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"U5q4VTgdkch4OgOBpC9frSVyGtW05MoMgfSIaAhJLmzFKlTVQi1qsxr9UPLPc6IZtXX9TrdR4cfiJbMCb1TTCg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-23T17:59:50.573616Z"},"content_sha256":"7982f0552240b64a0aa9253701557cec1107156ad7f2b67b2505375c56327cc8","schema_version":"1.0","event_id":"sha256:7982f0552240b64a0aa9253701557cec1107156ad7f2b67b2505375c56327cc8"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2023:R5I3O3BKJRI5W7L5DDVIWJLSCQ","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Fast Distributed Inference Serving for Large Language Models","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"FastServe enables token-level preemption and skip-join scheduling for LLM inference to raise throughput while holding latency fixed.","cross_cats":["cs.DC"],"primary_cat":"cs.LG","authors_text":"Bingyang Wu, Fangyue Liu, Gang Huang, Shengyu Liu, Xin Jin, Xuanzhe Liu, Yinmin Zhong, Yuanhang Sun, Zili Zhang","submitted_at":"2023-05-10T06:17:50Z","abstract_excerpt":"Large language models (LLMs) power a new generation of interactive AI applications exemplified by ChatGPT. The interactive nature of these applications demands low latency for LLM inference. Existing LLM serving systems use run-to-completion processing for inference jobs, which suffers from head-of-line blocking and long latency.\n  We present FastServe, a distributed inference serving system for LLMs. FastServe exploits the autoregressive pattern of LLM inference to enable preemption at the granularity of each output token. FastServe uses preemptive scheduling to minimize latency with a novel "},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"experimental results show that compared to the state-of-the-art solution vLLM, FastServe improves the throughput by up to 31.4x and 17.9x under the same average and tail latency requirements, respectively.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That token-level preemption and the skip-join MLFQ assignment based on input length incur low enough overhead to deliver the reported gains without hidden costs in real workloads.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"FastServe adds token-level preemption and a skip-join MLFQ scheduler to LLM serving, delivering up to 31.4x higher throughput than vLLM at equivalent average and tail latency.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"FastServe enables token-level preemption and skip-join scheduling for LLM inference to raise throughput while holding latency fixed.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"4feae767686b215ed42f1ea0dfcbef5ca22cb18e9277d8dea078c63a91932402"},"source":{"id":"2305.05920","kind":"arxiv","version":3},"verdict":{"id":"9a642a36-4920-4c52-8bb8-f59482e58f7c","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-17T11:17:21.588710Z","strongest_claim":"experimental results show that compared to the state-of-the-art solution vLLM, FastServe improves the throughput by up to 31.4x and 17.9x under the same average and tail latency requirements, respectively.","one_line_summary":"FastServe adds token-level preemption and a skip-join MLFQ scheduler to LLM serving, delivering up to 31.4x higher throughput than vLLM at equivalent average and tail latency.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That token-level preemption and the skip-join MLFQ assignment based on input length incur low enough overhead to deliver the reported gains without hidden costs in real workloads.","pith_extraction_headline":"FastServe enables token-level preemption and skip-join scheduling for LLM inference to raise throughput while holding latency fixed."},"references":{"count":59,"sample":[{"doi":"","year":2022,"title":"Introducing ChatGPT","work_id":"a5efa48c-9007-406c-b6d2-be938cb1c3ff","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"ChatGPT sets record for fastest-growing user base","work_id":"cedfe6dd-223f-4e96-8a8e-f07f315ad0b5","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"Reinventing search with a new ai-powered bing and edge, your copilot for the web","work_id":"25bd9eda-e0dd-4674-96c1-16b8c7daabd2","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"Our next-generation model: Gemini 1.5","work_id":"9cb11add-3054-4d6d-aa1c-8f99f011a10a","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"Introducing the next generation of Claude","work_id":"7c7d9cf6-059c-4acc-95c5-cd733b780dee","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":59,"snapshot_sha256":"2f0e5cba6fd1e55d264b3b89675f2fb3aef7066f184db32442194bfa57886bdd","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"9a642a36-4920-4c52-8bb8-f59482e58f7c"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:14Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"oE7W9C9RNOJ5rOu2dd777Zm304qYgS9dBsRQtFLrfpzFYdgWUOycF1dXMCuO6f+8sTVIj7b3+IrK/JIseMzqCw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-23T17:59:50.574601Z"},"content_sha256":"b2ad981beed688845f2fd8080518fd06759a169ba4483a602b73811d84d9fba6","schema_version":"1.0","event_id":"sha256:b2ad981beed688845f2fd8080518fd06759a169ba4483a602b73811d84d9fba6"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/R5I3O3BKJRI5W7L5DDVIWJLSCQ/bundle.json","state_url":"https://pith.science/pith/R5I3O3BKJRI5W7L5DDVIWJLSCQ/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/R5I3O3BKJRI5W7L5DDVIWJLSCQ/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-23T17:59:50Z","links":{"resolver":"https://pith.science/pith/R5I3O3BKJRI5W7L5DDVIWJLSCQ","bundle":"https://pith.science/pith/R5I3O3BKJRI5W7L5DDVIWJLSCQ/bundle.json","state":"https://pith.science/pith/R5I3O3BKJRI5W7L5DDVIWJLSCQ/state.json","well_known_bundle":"https://pith.science/.well-known/pith/R5I3O3BKJRI5W7L5DDVIWJLSCQ/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2023:R5I3O3BKJRI5W7L5DDVIWJLSCQ","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"78cdf56d7fa54d739f556b8432c47f660915962d0d52e492c2ac1e70b807618a","cross_cats_sorted":["cs.DC"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2023-05-10T06:17:50Z","title_canon_sha256":"11d47b641c181c272ea0ee2eff1e59d151b2e50a675352094028c329f8712803"},"schema_version":"1.0","source":{"id":"2305.05920","kind":"arxiv","version":3}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2305.05920","created_at":"2026-05-17T23:38:14Z"},{"alias_kind":"arxiv_version","alias_value":"2305.05920v3","created_at":"2026-05-17T23:38:14Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2305.05920","created_at":"2026-05-17T23:38:14Z"},{"alias_kind":"pith_short_12","alias_value":"R5I3O3BKJRI5","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"R5I3O3BKJRI5W7L5","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"R5I3O3BK","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:b2ad981beed688845f2fd8080518fd06759a169ba4483a602b73811d84d9fba6","target":"graph","created_at":"2026-05-17T23:38:14Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"experimental results show that compared to the state-of-the-art solution vLLM, FastServe improves the throughput by up to 31.4x and 17.9x under the same average and tail latency requirements, respectively."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That token-level preemption and the skip-join MLFQ assignment based on input length incur low enough overhead to deliver the reported gains without hidden costs in real workloads."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"FastServe adds token-level preemption and a skip-join MLFQ scheduler to LLM serving, delivering up to 31.4x higher throughput than vLLM at equivalent average and tail latency."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"FastServe enables token-level preemption and skip-join scheduling for LLM inference to raise throughput while holding latency fixed."}],"snapshot_sha256":"4feae767686b215ed42f1ea0dfcbef5ca22cb18e9277d8dea078c63a91932402"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"Large language models (LLMs) power a new generation of interactive AI applications exemplified by ChatGPT. The interactive nature of these applications demands low latency for LLM inference. Existing LLM serving systems use run-to-completion processing for inference jobs, which suffers from head-of-line blocking and long latency.\n  We present FastServe, a distributed inference serving system for LLMs. FastServe exploits the autoregressive pattern of LLM inference to enable preemption at the granularity of each output token. FastServe uses preemptive scheduling to minimize latency with a novel ","authors_text":"Bingyang Wu, Fangyue Liu, Gang Huang, Shengyu Liu, Xin Jin, Xuanzhe Liu, Yinmin Zhong, Yuanhang Sun, Zili Zhang","cross_cats":["cs.DC"],"headline":"FastServe enables token-level preemption and skip-join scheduling for LLM inference to raise throughput while holding latency fixed.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2023-05-10T06:17:50Z","title":"Fast Distributed Inference Serving for Large Language Models"},"references":{"count":59,"internal_anchors":0,"resolved_work":59,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Introducing ChatGPT","work_id":"a5efa48c-9007-406c-b6d2-be938cb1c3ff","year":2022},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"ChatGPT sets record for fastest-growing user base","work_id":"cedfe6dd-223f-4e96-8a8e-f07f315ad0b5","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Reinventing search with a new ai-powered bing and edge, your copilot for the web","work_id":"25bd9eda-e0dd-4674-96c1-16b8c7daabd2","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Our next-generation model: Gemini 1.5","work_id":"9cb11add-3054-4d6d-aa1c-8f99f011a10a","year":2024},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Introducing the next generation of Claude","work_id":"7c7d9cf6-059c-4acc-95c5-cd733b780dee","year":2024}],"snapshot_sha256":"2f0e5cba6fd1e55d264b3b89675f2fb3aef7066f184db32442194bfa57886bdd"},"source":{"id":"2305.05920","kind":"arxiv","version":3},"verdict":{"created_at":"2026-05-17T11:17:21.588710Z","id":"9a642a36-4920-4c52-8bb8-f59482e58f7c","model_set":{"reader":"grok-4.3"},"one_line_summary":"FastServe adds token-level preemption and a skip-join MLFQ scheduler to LLM serving, delivering up to 31.4x higher throughput than vLLM at equivalent average and tail latency.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"FastServe enables token-level preemption and skip-join scheduling for LLM inference to raise throughput while holding latency fixed.","strongest_claim":"experimental results show that compared to the state-of-the-art solution vLLM, FastServe improves the throughput by up to 31.4x and 17.9x under the same average and tail latency requirements, respectively.","weakest_assumption":"That token-level preemption and the skip-join MLFQ assignment based on input length incur low enough overhead to deliver the reported gains without hidden costs in real workloads."}},"verdict_id":"9a642a36-4920-4c52-8bb8-f59482e58f7c"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:7982f0552240b64a0aa9253701557cec1107156ad7f2b67b2505375c56327cc8","target":"record","created_at":"2026-05-17T23:38:14Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"78cdf56d7fa54d739f556b8432c47f660915962d0d52e492c2ac1e70b807618a","cross_cats_sorted":["cs.DC"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2023-05-10T06:17:50Z","title_canon_sha256":"11d47b641c181c272ea0ee2eff1e59d151b2e50a675352094028c329f8712803"},"schema_version":"1.0","source":{"id":"2305.05920","kind":"arxiv","version":3}},"canonical_sha256":"8f51b76c2a4c51db7d7d18ea8b25721415869cc95e7906d90d5fba833ac4d882","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"8f51b76c2a4c51db7d7d18ea8b25721415869cc95e7906d90d5fba833ac4d882","first_computed_at":"2026-05-17T23:38:14.249636Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:14.249636Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"Wb3Q11wq+G1cSbuW2tZXGn+Y43OnnNqDxorg7DSmWCe9Uapm2FC57gCaMg+7iXqpW54u1MMZwp45VnRtSY3tDw==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:14.250276Z","signed_message":"canonical_sha256_bytes"},"source_id":"2305.05920","source_kind":"arxiv","source_version":3}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:7982f0552240b64a0aa9253701557cec1107156ad7f2b67b2505375c56327cc8","sha256:b2ad981beed688845f2fd8080518fd06759a169ba4483a602b73811d84d9fba6"],"state_sha256":"31fe89cfd247285bbdb92a693ef0fa33c08fa698cbc357132aee249949a0f95b"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"0iTiERm1f8+g2/Sz1AdPGaeaHhfP+Mqmz7Z11VGF0d4Z6JB7prLt59mhVALq+neY/ge8BXOjT+Pt1liZ0LlcDw==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-23T17:59:50.578712Z","bundle_sha256":"a4a9d48a0544ff21ce2d5300b352639c5c41f333fd4286379cef6a8584a6f7e1"}}