{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:PFQ4IG73A4U5ZLJPCG76ZK7MIQ","short_pith_number":"pith:PFQ4IG73","canonical_record":{"source":{"id":"2605.14217","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-14T00:19:41Z","cross_cats_sorted":["cs.AI","cs.CL","cs.SY","eess.SY"],"title_canon_sha256":"f6fbe28f555b9ae084ed42d96daebe9c88f492ec8dabe5a81d3ffc062b157bb5","abstract_canon_sha256":"ae94a8a8261dba87cb2891b1f9d169d168d2be8c1ae311ed59ec1cceb1113bef"},"schema_version":"1.0"},"canonical_sha256":"7961c41bfb0729dcad2f11bfecabec44385f09e18554f02d884e88f96772842a","source":{"kind":"arxiv","id":"2605.14217","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.14217","created_at":"2026-05-17T23:39:10Z"},{"alias_kind":"arxiv_version","alias_value":"2605.14217v1","created_at":"2026-05-17T23:39:10Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.14217","created_at":"2026-05-17T23:39:10Z"},{"alias_kind":"pith_short_12","alias_value":"PFQ4IG73A4U5","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"PFQ4IG73A4U5ZLJP","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"PFQ4IG73","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:PFQ4IG73A4U5ZLJPCG76ZK7MIQ","target":"record","payload":{"canonical_record":{"source":{"id":"2605.14217","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-14T00:19:41Z","cross_cats_sorted":["cs.AI","cs.CL","cs.SY","eess.SY"],"title_canon_sha256":"f6fbe28f555b9ae084ed42d96daebe9c88f492ec8dabe5a81d3ffc062b157bb5","abstract_canon_sha256":"ae94a8a8261dba87cb2891b1f9d169d168d2be8c1ae311ed59ec1cceb1113bef"},"schema_version":"1.0"},"canonical_sha256":"7961c41bfb0729dcad2f11bfecabec44385f09e18554f02d884e88f96772842a","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:39:10.868497Z","signature_b64":"tUhrrfAn9XvCzd1e4IodQlxwjn/kkkROFljEreVSsl3KZpLIcMwJdhk5QwuBhj1Mh2FXyvvj2dEjNblxT0fABg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"7961c41bfb0729dcad2f11bfecabec44385f09e18554f02d884e88f96772842a","last_reissued_at":"2026-05-17T23:39:10.867464Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:39:10.867464Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2605.14217","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:39:10Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"JpvnsRiJaGJ8gwEkwDpIN1mPDct9S+OaVi1eg1GDwXawE1PXjizQ9DRpF4dVNTUhuef2ySo0BYf0ODmTBXNSBA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-30T16:32:26.626231Z"},"content_sha256":"95f4ed1637faa8cf65819e7b976a031b28acc9c360bed89cf16e078b39414a8a","schema_version":"1.0","event_id":"sha256:95f4ed1637faa8cf65819e7b976a031b28acc9c360bed89cf16e078b39414a8a"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:PFQ4IG73A4U5ZLJPCG76ZK7MIQ","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"PreFT: Prefill-only finetuning for efficient inference","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Applying adapters only during prefill and discarding them afterward raises serving throughput nearly twofold while keeping performance near standard PEFT levels.","cross_cats":["cs.AI","cs.CL","cs.SY","eess.SY"],"primary_cat":"cs.LG","authors_text":"Andrew Lanpouthakoun, Aryaman Arora, Ben Keigwin, Christopher Potts, Dan Jurafsky, Dhruv Pai, Zhengxuan Wu","submitted_at":"2026-05-14T00:19:41Z","abstract_excerpt":"Large language models can now be personalised efficiently at scale using parameter efficient finetuning methods (PEFTs), but serving user-specific PEFTs harms throughput, even with specialised kernels and memory management techniques. This is because, theoretically and empirically, a mismatch exists between prefill (processing a large number of tokens at once) and decode (generating a single token autoregressively): the latter has far lower throughput when serving multiple adapters. Rather than optimising performance relative to parameter count, for efficient multi-adapter serving, we instead "},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"serving multi-user PreFTs is more efficient than traditional PEFTs (1.9× the throughput when serving 512 adapters on Llama 3.1 70B). On RL tasks PreFTs approach parity with standard PEFTs.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That discarding the adapter after prefill does not materially degrade the quality of the generated tokens on downstream tasks, and that any loss can be offset by increasing adapter rank without throughput cost.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Prefill-only adaptation of LLMs yields 1.9x higher throughput for 512 adapters on Llama 3.1 70B with near-parity performance on RL tasks and recoverable loss on SFT.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Applying adapters only during prefill and discarding them afterward raises serving throughput nearly twofold while keeping performance near standard PEFT levels.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"7987dbc8cc7b75f1a838c8a4e6568b68c75332682a108d84c3412368598fefcf"},"source":{"id":"2605.14217","kind":"arxiv","version":1},"verdict":{"id":"9f392c39-3f91-4424-a563-12578a7818c6","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T02:06:32.961245Z","strongest_claim":"serving multi-user PreFTs is more efficient than traditional PEFTs (1.9× the throughput when serving 512 adapters on Llama 3.1 70B). On RL tasks PreFTs approach parity with standard PEFTs.","one_line_summary":"Prefill-only adaptation of LLMs yields 1.9x higher throughput for 512 adapters on Llama 3.1 70B with near-parity performance on RL tasks and recoverable loss on SFT.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That discarding the adapter after prefill does not materially degrade the quality of the generated tokens on downstream tasks, and that any loss can be offset by increasing adapter rank without throughput cost.","pith_extraction_headline":"Applying adapters only during prefill and discarding them afterward raises serving throughput nearly twofold while keeping performance near standard PEFT levels."},"references":{"count":46,"sample":[{"doi":"","year":2024,"title":"On-policy distillation of language models: Learning from self-generated mistakes","work_id":"010d5562-cfba-4601-991d-c76f3e8d51b1","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2021,"title":"Program Synthesis with Large Language Models","work_id":"fd241a05-03b9-4de2-9588-9d77ce176125","ref_index":2,"cited_arxiv_id":"2108.07732","is_internal_anchor":true},{"doi":"","year":2025,"title":"How to Scale Your Model","work_id":"3dc2596f-4ac5-4e87-90cf-f6a0ea7decc8","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"2408.07055 , archiveprefix =","work_id":"7b17f534-444c-40c9-8c05-ead22c892088","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"10.3233/faia251185","year":2025,"title":"LoRA-XS : Low-rank adaptation with extremely small number of parameters","work_id":"59a9e976-36c1-4db5-8eed-bf7d79f5664e","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":46,"snapshot_sha256":"b5cb3c5b644be6dad61a699f3accd7cead1385ff97a45b495fe677c7738dc106","internal_anchors":10},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"9f392c39-3f91-4424-a563-12578a7818c6"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:39:10Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"Q6vdTmTtA3LV0p7FC4vobShesgmEbkcOWhvntVNkDAQe2staC9zOxPEoIISSzpWLm4VTcP6iiM+wVTxlGaBoDA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-30T16:32:26.627744Z"},"content_sha256":"cac42928e2d93135980017f4e0e1b2010dff44658d9417a37546d55d74b2c0f8","schema_version":"1.0","event_id":"sha256:cac42928e2d93135980017f4e0e1b2010dff44658d9417a37546d55d74b2c0f8"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/PFQ4IG73A4U5ZLJPCG76ZK7MIQ/bundle.json","state_url":"https://pith.science/pith/PFQ4IG73A4U5ZLJPCG76ZK7MIQ/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/PFQ4IG73A4U5ZLJPCG76ZK7MIQ/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-30T16:32:26Z","links":{"resolver":"https://pith.science/pith/PFQ4IG73A4U5ZLJPCG76ZK7MIQ","bundle":"https://pith.science/pith/PFQ4IG73A4U5ZLJPCG76ZK7MIQ/bundle.json","state":"https://pith.science/pith/PFQ4IG73A4U5ZLJPCG76ZK7MIQ/state.json","well_known_bundle":"https://pith.science/.well-known/pith/PFQ4IG73A4U5ZLJPCG76ZK7MIQ/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:PFQ4IG73A4U5ZLJPCG76ZK7MIQ","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"ae94a8a8261dba87cb2891b1f9d169d168d2be8c1ae311ed59ec1cceb1113bef","cross_cats_sorted":["cs.AI","cs.CL","cs.SY","eess.SY"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-14T00:19:41Z","title_canon_sha256":"f6fbe28f555b9ae084ed42d96daebe9c88f492ec8dabe5a81d3ffc062b157bb5"},"schema_version":"1.0","source":{"id":"2605.14217","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.14217","created_at":"2026-05-17T23:39:10Z"},{"alias_kind":"arxiv_version","alias_value":"2605.14217v1","created_at":"2026-05-17T23:39:10Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.14217","created_at":"2026-05-17T23:39:10Z"},{"alias_kind":"pith_short_12","alias_value":"PFQ4IG73A4U5","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"PFQ4IG73A4U5ZLJP","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"PFQ4IG73","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:cac42928e2d93135980017f4e0e1b2010dff44658d9417a37546d55d74b2c0f8","target":"graph","created_at":"2026-05-17T23:39:10Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"serving multi-user PreFTs is more efficient than traditional PEFTs (1.9× the throughput when serving 512 adapters on Llama 3.1 70B). On RL tasks PreFTs approach parity with standard PEFTs."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That discarding the adapter after prefill does not materially degrade the quality of the generated tokens on downstream tasks, and that any loss can be offset by increasing adapter rank without throughput cost."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Prefill-only adaptation of LLMs yields 1.9x higher throughput for 512 adapters on Llama 3.1 70B with near-parity performance on RL tasks and recoverable loss on SFT."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Applying adapters only during prefill and discarding them afterward raises serving throughput nearly twofold while keeping performance near standard PEFT levels."}],"snapshot_sha256":"7987dbc8cc7b75f1a838c8a4e6568b68c75332682a108d84c3412368598fefcf"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"Large language models can now be personalised efficiently at scale using parameter efficient finetuning methods (PEFTs), but serving user-specific PEFTs harms throughput, even with specialised kernels and memory management techniques. This is because, theoretically and empirically, a mismatch exists between prefill (processing a large number of tokens at once) and decode (generating a single token autoregressively): the latter has far lower throughput when serving multiple adapters. Rather than optimising performance relative to parameter count, for efficient multi-adapter serving, we instead ","authors_text":"Andrew Lanpouthakoun, Aryaman Arora, Ben Keigwin, Christopher Potts, Dan Jurafsky, Dhruv Pai, Zhengxuan Wu","cross_cats":["cs.AI","cs.CL","cs.SY","eess.SY"],"headline":"Applying adapters only during prefill and discarding them afterward raises serving throughput nearly twofold while keeping performance near standard PEFT levels.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-14T00:19:41Z","title":"PreFT: Prefill-only finetuning for efficient inference"},"references":{"count":46,"internal_anchors":10,"resolved_work":46,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"On-policy distillation of language models: Learning from self-generated mistakes","work_id":"010d5562-cfba-4601-991d-c76f3e8d51b1","year":2024},{"cited_arxiv_id":"2108.07732","doi":"","is_internal_anchor":true,"ref_index":2,"title":"Program Synthesis with Large Language Models","work_id":"fd241a05-03b9-4de2-9588-9d77ce176125","year":2021},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"How to Scale Your Model","work_id":"3dc2596f-4ac5-4e87-90cf-f6a0ea7decc8","year":2025},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"2408.07055 , archiveprefix =","work_id":"7b17f534-444c-40c9-8c05-ead22c892088","year":2024},{"cited_arxiv_id":"","doi":"10.3233/faia251185","is_internal_anchor":false,"ref_index":5,"title":"LoRA-XS : Low-rank adaptation with extremely small number of parameters","work_id":"59a9e976-36c1-4db5-8eed-bf7d79f5664e","year":2025}],"snapshot_sha256":"b5cb3c5b644be6dad61a699f3accd7cead1385ff97a45b495fe677c7738dc106"},"source":{"id":"2605.14217","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-15T02:06:32.961245Z","id":"9f392c39-3f91-4424-a563-12578a7818c6","model_set":{"reader":"grok-4.3"},"one_line_summary":"Prefill-only adaptation of LLMs yields 1.9x higher throughput for 512 adapters on Llama 3.1 70B with near-parity performance on RL tasks and recoverable loss on SFT.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Applying adapters only during prefill and discarding them afterward raises serving throughput nearly twofold while keeping performance near standard PEFT levels.","strongest_claim":"serving multi-user PreFTs is more efficient than traditional PEFTs (1.9× the throughput when serving 512 adapters on Llama 3.1 70B). On RL tasks PreFTs approach parity with standard PEFTs.","weakest_assumption":"That discarding the adapter after prefill does not materially degrade the quality of the generated tokens on downstream tasks, and that any loss can be offset by increasing adapter rank without throughput cost."}},"verdict_id":"9f392c39-3f91-4424-a563-12578a7818c6"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:95f4ed1637faa8cf65819e7b976a031b28acc9c360bed89cf16e078b39414a8a","target":"record","created_at":"2026-05-17T23:39:10Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"ae94a8a8261dba87cb2891b1f9d169d168d2be8c1ae311ed59ec1cceb1113bef","cross_cats_sorted":["cs.AI","cs.CL","cs.SY","eess.SY"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-14T00:19:41Z","title_canon_sha256":"f6fbe28f555b9ae084ed42d96daebe9c88f492ec8dabe5a81d3ffc062b157bb5"},"schema_version":"1.0","source":{"id":"2605.14217","kind":"arxiv","version":1}},"canonical_sha256":"7961c41bfb0729dcad2f11bfecabec44385f09e18554f02d884e88f96772842a","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"7961c41bfb0729dcad2f11bfecabec44385f09e18554f02d884e88f96772842a","first_computed_at":"2026-05-17T23:39:10.867464Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:39:10.867464Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"tUhrrfAn9XvCzd1e4IodQlxwjn/kkkROFljEreVSsl3KZpLIcMwJdhk5QwuBhj1Mh2FXyvvj2dEjNblxT0fABg==","signature_status":"signed_v1","signed_at":"2026-05-17T23:39:10.868497Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.14217","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:95f4ed1637faa8cf65819e7b976a031b28acc9c360bed89cf16e078b39414a8a","sha256:cac42928e2d93135980017f4e0e1b2010dff44658d9417a37546d55d74b2c0f8"],"state_sha256":"7ab3f42e9909210ad983dc90275700af239a3730392a2277d47c40866f3c3555"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"lMXXsmAju8FS4DSbRNhWdZF23JBUoGwMd7o0rPQfp8CC63dA8WRZF1d3Xn9ROOUxyaHJI7r4ncWw3TbHfCX+AQ==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-30T16:32:26.633118Z","bundle_sha256":"372905572aa7c443d72220a40da59d75555493a0c0ee38d5a7eaa5a44cd6d8e1"}}