{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2023:LK3AAFPBRF6FDZR2UNQ677ROMZ","short_pith_number":"pith:LK3AAFPB","canonical_record":{"source":{"id":"2308.16369","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2023-08-31T00:03:02Z","cross_cats_sorted":["cs.DC"],"title_canon_sha256":"52013df046821f6fa51ac4289806767fcc03790d923841a9e0b1f85213776b67","abstract_canon_sha256":"466bf7c6ea41511e785a758ea569c17066f4dacb24704504de9573d6d0ed8b1e"},"schema_version":"1.0"},"canonical_sha256":"5ab60015e1897c51e63aa361effe2e666a536eca4b29df28eb889ec3d70dd7a7","source":{"kind":"arxiv","id":"2308.16369","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2308.16369","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"arxiv_version","alias_value":"2308.16369v1","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2308.16369","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"pith_short_12","alias_value":"LK3AAFPBRF6F","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"LK3AAFPBRF6FDZR2","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"LK3AAFPB","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2023:LK3AAFPBRF6FDZR2UNQ677ROMZ","target":"record","payload":{"canonical_record":{"source":{"id":"2308.16369","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2023-08-31T00:03:02Z","cross_cats_sorted":["cs.DC"],"title_canon_sha256":"52013df046821f6fa51ac4289806767fcc03790d923841a9e0b1f85213776b67","abstract_canon_sha256":"466bf7c6ea41511e785a758ea569c17066f4dacb24704504de9573d6d0ed8b1e"},"schema_version":"1.0"},"canonical_sha256":"5ab60015e1897c51e63aa361effe2e666a536eca4b29df28eb889ec3d70dd7a7","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:48.830683Z","signature_b64":"GtdnUEEidREP3/lpqNFBYXy1hcX5WJydt9VeMIXDW28iT4pgSRzbJUcAkrLr8SB9w+31VMfUKMBItm4/mG5JAg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"5ab60015e1897c51e63aa361effe2e666a536eca4b29df28eb889ec3d70dd7a7","last_reissued_at":"2026-05-17T23:38:48.830035Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:48.830035Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2308.16369","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:48Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"3h5/SX0hnyKQvAqc6YJbDZLlZg1l1qvhcPhNZR150fYQ0PDPWqU69wV1bLIyFhnI+GAEfhxsB63eWN1urscFBQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-24T15:40:14.621329Z"},"content_sha256":"2e6861412a5e11e67f62bf1007571176132ad5c7af8bf9f00bfdc2994501ca28","schema_version":"1.0","event_id":"sha256:2e6861412a5e11e67f62bf1007571176132ad5c7af8bf9f00bfdc2994501ca28"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2023:LK3AAFPBRF6FDZR2UNQ677ROMZ","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"SARATHI: Efficient LLM Inference by Piggybacking Decodes with Chunked Prefills","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"SARATHI splits each prefill into equal chunks and fills the rest of every batch with decode requests so the chunks saturate GPU compute while decodes piggyback at far lower cost.","cross_cats":["cs.DC"],"primary_cat":"cs.LG","authors_text":"Amey Agrawal, Ashish Panwar, Bhargav S. Gulavani, Jayashree Mohan, Nipun Kwatra, Ramachandran Ramjee","submitted_at":"2023-08-31T00:03:02Z","abstract_excerpt":"Large Language Model (LLM) inference consists of two distinct phases - prefill phase which processes the input prompt and decode phase which generates output tokens autoregressively. While the prefill phase effectively saturates GPU compute at small batch sizes, the decode phase results in low compute utilization as it generates one token at a time per request. The varying prefill and decode times also lead to imbalance across micro-batches when using pipeline parallelism, resulting in further inefficiency due to bubbles.\n  We present SARATHI to address these challenges. SARATHI employs chunke"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"For the LLaMA-13B model on A6000 GPU, SARATHI improves decode throughput by up to 10x, and accelerates end-to-end throughput by up to 1.33x. When used with pipeline parallelism on GPT-3, SARATHI reduces bubbles by 6.29x, resulting in an end-to-end throughput improvement of 1.91x.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That chunked prefills can be performed without accuracy loss or extra memory overhead and that decode requests can be freely mixed into the same batch as a prefill chunk while preserving correct autoregressive generation.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"SARATHI uses chunked prefills and decode-maximal batching to let decode steps ride along with prefill compute, delivering up to 10x higher decode throughput and 1.91x end-to-end throughput on models including LLaMA-13B and GPT-3.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"SARATHI splits each prefill into equal chunks and fills the rest of every batch with decode requests so the chunks saturate GPU compute while decodes piggyback at far lower cost.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"4295394de828f488a885f09e57676e686064d461654e320359bc358952e4f7d3"},"source":{"id":"2308.16369","kind":"arxiv","version":1},"verdict":{"id":"43b09fc4-e42c-44c2-948b-45d39053332d","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-16T06:27:19.762524Z","strongest_claim":"For the LLaMA-13B model on A6000 GPU, SARATHI improves decode throughput by up to 10x, and accelerates end-to-end throughput by up to 1.33x. When used with pipeline parallelism on GPT-3, SARATHI reduces bubbles by 6.29x, resulting in an end-to-end throughput improvement of 1.91x.","one_line_summary":"SARATHI uses chunked prefills and decode-maximal batching to let decode steps ride along with prefill compute, delivering up to 10x higher decode throughput and 1.91x end-to-end throughput on models including LLaMA-13B and GPT-3.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That chunked prefills can be performed without accuracy loss or extra memory overhead and that decode requests can be freely mixed into the same batch as a prefill chunk while preserving correct autoregressive generation.","pith_extraction_headline":"SARATHI splits each prefill into equal chunks and fills the rest of every batch with decode requests so the chunks saturate GPU compute while decodes piggyback at far lower cost."},"references":{"count":48,"sample":[{"doi":"","year":null,"title":"https://aws.amazon.com/ codewhisperer/","work_id":"8ceb9062-1081-4032-823d-82de237e4f51","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"https://claude.ai","work_id":"f42d5c73-87c4-4047-8114-d692921e1e62","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"https://www.bing.com/chat","work_id":"9b6158d3-54e7-4e29-bd55-9fd55925290c","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"https://character.ai","work_id":"53e8cd23-a2da-4851-ba9c-d27d179df274","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"https://chat.openai.com","work_id":"1d52047a-4bbb-4d45-8130-15e3ce4a1d05","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":48,"snapshot_sha256":"31aa9d4c1bfd7cb41b173b3bfe228e77ee0333d8349cb9ea21f33e46477b5305","internal_anchors":4},"formal_canon":{"evidence_count":2,"snapshot_sha256":"c51d576addadf81f81c2c088dc4ed515d4dfdb28872d193ba34273b4bf8ca988"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"43b09fc4-e42c-44c2-948b-45d39053332d"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:48Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"cWWp4nYUEyL35a+jjuWGjgV8wv6rfINRp8naaqbm13oHZd1/3zbuYIP1Z8HlDx/1dx+FkwY/CDpfEZpkhzb9DQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-24T15:40:14.622444Z"},"content_sha256":"2e66f56c689c48047016835e4f9a3608d33dfc95bacea925c0f1fafbfccd5c62","schema_version":"1.0","event_id":"sha256:2e66f56c689c48047016835e4f9a3608d33dfc95bacea925c0f1fafbfccd5c62"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/LK3AAFPBRF6FDZR2UNQ677ROMZ/bundle.json","state_url":"https://pith.science/pith/LK3AAFPBRF6FDZR2UNQ677ROMZ/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/LK3AAFPBRF6FDZR2UNQ677ROMZ/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-24T15:40:14Z","links":{"resolver":"https://pith.science/pith/LK3AAFPBRF6FDZR2UNQ677ROMZ","bundle":"https://pith.science/pith/LK3AAFPBRF6FDZR2UNQ677ROMZ/bundle.json","state":"https://pith.science/pith/LK3AAFPBRF6FDZR2UNQ677ROMZ/state.json","well_known_bundle":"https://pith.science/.well-known/pith/LK3AAFPBRF6FDZR2UNQ677ROMZ/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2023:LK3AAFPBRF6FDZR2UNQ677ROMZ","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"466bf7c6ea41511e785a758ea569c17066f4dacb24704504de9573d6d0ed8b1e","cross_cats_sorted":["cs.DC"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2023-08-31T00:03:02Z","title_canon_sha256":"52013df046821f6fa51ac4289806767fcc03790d923841a9e0b1f85213776b67"},"schema_version":"1.0","source":{"id":"2308.16369","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2308.16369","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"arxiv_version","alias_value":"2308.16369v1","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2308.16369","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"pith_short_12","alias_value":"LK3AAFPBRF6F","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"LK3AAFPBRF6FDZR2","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"LK3AAFPB","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:2e66f56c689c48047016835e4f9a3608d33dfc95bacea925c0f1fafbfccd5c62","target":"graph","created_at":"2026-05-17T23:38:48Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"For the LLaMA-13B model on A6000 GPU, SARATHI improves decode throughput by up to 10x, and accelerates end-to-end throughput by up to 1.33x. When used with pipeline parallelism on GPT-3, SARATHI reduces bubbles by 6.29x, resulting in an end-to-end throughput improvement of 1.91x."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That chunked prefills can be performed without accuracy loss or extra memory overhead and that decode requests can be freely mixed into the same batch as a prefill chunk while preserving correct autoregressive generation."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"SARATHI uses chunked prefills and decode-maximal batching to let decode steps ride along with prefill compute, delivering up to 10x higher decode throughput and 1.91x end-to-end throughput on models including LLaMA-13B and GPT-3."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"SARATHI splits each prefill into equal chunks and fills the rest of every batch with decode requests so the chunks saturate GPU compute while decodes piggyback at far lower cost."}],"snapshot_sha256":"4295394de828f488a885f09e57676e686064d461654e320359bc358952e4f7d3"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"c51d576addadf81f81c2c088dc4ed515d4dfdb28872d193ba34273b4bf8ca988"},"paper":{"abstract_excerpt":"Large Language Model (LLM) inference consists of two distinct phases - prefill phase which processes the input prompt and decode phase which generates output tokens autoregressively. While the prefill phase effectively saturates GPU compute at small batch sizes, the decode phase results in low compute utilization as it generates one token at a time per request. The varying prefill and decode times also lead to imbalance across micro-batches when using pipeline parallelism, resulting in further inefficiency due to bubbles.\n  We present SARATHI to address these challenges. SARATHI employs chunke","authors_text":"Amey Agrawal, Ashish Panwar, Bhargav S. Gulavani, Jayashree Mohan, Nipun Kwatra, Ramachandran Ramjee","cross_cats":["cs.DC"],"headline":"SARATHI splits each prefill into equal chunks and fills the rest of every batch with decode requests so the chunks saturate GPU compute while decodes piggyback at far lower cost.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2023-08-31T00:03:02Z","title":"SARATHI: Efficient LLM Inference by Piggybacking Decodes with Chunked Prefills"},"references":{"count":48,"internal_anchors":4,"resolved_work":48,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"https://aws.amazon.com/ codewhisperer/","work_id":"8ceb9062-1081-4032-823d-82de237e4f51","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"https://claude.ai","work_id":"f42d5c73-87c4-4047-8114-d692921e1e62","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"https://www.bing.com/chat","work_id":"9b6158d3-54e7-4e29-bd55-9fd55925290c","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"https://character.ai","work_id":"53e8cd23-a2da-4851-ba9c-d27d179df274","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"https://chat.openai.com","work_id":"1d52047a-4bbb-4d45-8130-15e3ce4a1d05","year":null}],"snapshot_sha256":"31aa9d4c1bfd7cb41b173b3bfe228e77ee0333d8349cb9ea21f33e46477b5305"},"source":{"id":"2308.16369","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-16T06:27:19.762524Z","id":"43b09fc4-e42c-44c2-948b-45d39053332d","model_set":{"reader":"grok-4.3"},"one_line_summary":"SARATHI uses chunked prefills and decode-maximal batching to let decode steps ride along with prefill compute, delivering up to 10x higher decode throughput and 1.91x end-to-end throughput on models including LLaMA-13B and GPT-3.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"SARATHI splits each prefill into equal chunks and fills the rest of every batch with decode requests so the chunks saturate GPU compute while decodes piggyback at far lower cost.","strongest_claim":"For the LLaMA-13B model on A6000 GPU, SARATHI improves decode throughput by up to 10x, and accelerates end-to-end throughput by up to 1.33x. When used with pipeline parallelism on GPT-3, SARATHI reduces bubbles by 6.29x, resulting in an end-to-end throughput improvement of 1.91x.","weakest_assumption":"That chunked prefills can be performed without accuracy loss or extra memory overhead and that decode requests can be freely mixed into the same batch as a prefill chunk while preserving correct autoregressive generation."}},"verdict_id":"43b09fc4-e42c-44c2-948b-45d39053332d"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:2e6861412a5e11e67f62bf1007571176132ad5c7af8bf9f00bfdc2994501ca28","target":"record","created_at":"2026-05-17T23:38:48Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"466bf7c6ea41511e785a758ea569c17066f4dacb24704504de9573d6d0ed8b1e","cross_cats_sorted":["cs.DC"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2023-08-31T00:03:02Z","title_canon_sha256":"52013df046821f6fa51ac4289806767fcc03790d923841a9e0b1f85213776b67"},"schema_version":"1.0","source":{"id":"2308.16369","kind":"arxiv","version":1}},"canonical_sha256":"5ab60015e1897c51e63aa361effe2e666a536eca4b29df28eb889ec3d70dd7a7","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"5ab60015e1897c51e63aa361effe2e666a536eca4b29df28eb889ec3d70dd7a7","first_computed_at":"2026-05-17T23:38:48.830035Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:48.830035Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"GtdnUEEidREP3/lpqNFBYXy1hcX5WJydt9VeMIXDW28iT4pgSRzbJUcAkrLr8SB9w+31VMfUKMBItm4/mG5JAg==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:48.830683Z","signed_message":"canonical_sha256_bytes"},"source_id":"2308.16369","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:2e6861412a5e11e67f62bf1007571176132ad5c7af8bf9f00bfdc2994501ca28","sha256:2e66f56c689c48047016835e4f9a3608d33dfc95bacea925c0f1fafbfccd5c62"],"state_sha256":"b32d9727685ac23dfa9e437f7cb693e9450ca62c5a5a8cf2101115d18fb78cc3"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"ercqWzcmCDWT9tMc2FCjYNsqvqio1ZS8E6mEan9lZ5y6cIhXL3Kps40z5L4Aq1NcBEroaDgNgb97r0RsTM8oAw==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-24T15:40:14.628252Z","bundle_sha256":"0065f7f14eaa5b47f14e8c8997124cc8c5c992de7e84ec018756583f87d7642e"}}