{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:PS6UM5SLUKGNGJYHCWQ563ITYT","short_pith_number":"pith:PS6UM5SL","schema_version":"1.0","canonical_sha256":"7cbd46764ba28cd3270715a1df6d13c4fe90ee21e86c247035dcb8a9e40a675b","source":{"kind":"arxiv","id":"2605.09329","version":2},"attestation_state":"computed","paper":{"title":"Test-Time Speculation","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Test-Time Speculation adapts the draft model online using target verification signals to sustain high acceptance lengths during long LLM generations.","cross_cats":["cs.LG"],"primary_cat":"cs.CL","authors_text":"Avinash Kumar, Poulami Das, Sujay Sanghavi","submitted_at":"2026-05-10T05:02:39Z","abstract_excerpt":"Speculative decoding accelerates LLM inference by using a fast draft model to generate tokens and a more accurate target model to verify them. Its performance depends on the $\\textit{acceptance length}$, or number of draft tokens accepted by the target. Our studies show that the acceptance length of even state-of-the-art speculators, like DFlash, EAGLE-3 and PARD degrade with generation length, reaching values close to 1 (i.e. no speedup) within just a few thousand output tokens, making speculators ineffective for long-response tasks. Acceptance lengths decline because most speculators are tra"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2605.09329","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-10T05:02:39Z","cross_cats_sorted":["cs.LG"],"title_canon_sha256":"2a9e92011d09426b263c1870a5b573d5aaf6d24912f2beaa4399609a6fefc30a","abstract_canon_sha256":"1bfeebf3f66287e262cabd96664ca9d99e6e8fc212f48e0e771c0d64ba4a3a03"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T02:05:45.517761Z","signature_b64":"K1M7xv4YWCLGWJw5iK9b2rxCeU5eFM6WPZBMb4N3OUkieT5Mm6hmpKU8ChuisZsHN+7fqvB26dA8E9Q9Zua4CA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"7cbd46764ba28cd3270715a1df6d13c4fe90ee21e86c247035dcb8a9e40a675b","last_reissued_at":"2026-05-20T02:05:45.516862Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T02:05:45.516862Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Test-Time Speculation","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Test-Time Speculation adapts the draft model online using target verification signals to sustain high acceptance lengths during long LLM generations.","cross_cats":["cs.LG"],"primary_cat":"cs.CL","authors_text":"Avinash Kumar, Poulami Das, Sujay Sanghavi","submitted_at":"2026-05-10T05:02:39Z","abstract_excerpt":"Speculative decoding accelerates LLM inference by using a fast draft model to generate tokens and a more accurate target model to verify them. Its performance depends on the $\\textit{acceptance length}$, or number of draft tokens accepted by the target. Our studies show that the acceptance length of even state-of-the-art speculators, like DFlash, EAGLE-3 and PARD degrade with generation length, reaching values close to 1 (i.e. no speedup) within just a few thousand output tokens, making speculators ineffective for long-response tasks. Acceptance lengths decline because most speculators are tra"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"TTS improves acceptance lengths over state-of-the-art speculators by up to 72% and 41% on average, with the benefits scaling with increased generation lengths.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That continuous online updates to the draft model remain stable and do not introduce latency, divergence, or quality degradation over very long generations, and that the verification signals provide sufficient training data without distribution shift issues.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Test-Time Speculation adapts draft models online via target-model verifications to sustain high acceptance lengths during long LLM generations.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Test-Time Speculation adapts the draft model online using target verification signals to sustain high acceptance lengths during long LLM generations.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"1c919b757e83ea07236082022a02fb152d8762dba7118768a807183edc60c748"},"source":{"id":"2605.09329","kind":"arxiv","version":2},"verdict":{"id":"cca480be-b751-4bdc-a8de-13299a44c34b","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-12T04:14:27.087400Z","strongest_claim":"TTS improves acceptance lengths over state-of-the-art speculators by up to 72% and 41% on average, with the benefits scaling with increased generation lengths.","one_line_summary":"Test-Time Speculation adapts draft models online via target-model verifications to sustain high acceptance lengths during long LLM generations.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That continuous online updates to the draft model remain stable and do not introduce latency, divergence, or quality degradation over very long generations, and that the verification signals provide sufficient training data without distribution shift issues.","pith_extraction_headline":"Test-Time Speculation adapts the draft model online using target verification signals to sustain high acceptance lengths during long LLM generations."},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.09329/integrity.json","findings":[],"available":true,"detectors_run":[{"name":"ai_meta_artifact","ran_at":"2026-05-19T20:33:37.908954Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"doi_title_agreement","ran_at":"2026-05-19T13:01:18.638460Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"doi_compliance","ran_at":"2026-05-19T10:20:44.934083Z","status":"completed","version":"1.0.0","findings_count":0}],"snapshot_sha256":"fd788f1be82afe62d4dfdcbaab37e2744455d6f89b66ec115fe0c3321daf57bc"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2605.09329","created_at":"2026-05-20T02:05:45.516986+00:00"},{"alias_kind":"arxiv_version","alias_value":"2605.09329v2","created_at":"2026-05-20T02:05:45.516986+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.09329","created_at":"2026-05-20T02:05:45.516986+00:00"},{"alias_kind":"pith_short_12","alias_value":"PS6UM5SLUKGN","created_at":"2026-05-20T02:05:45.516986+00:00"},{"alias_kind":"pith_short_16","alias_value":"PS6UM5SLUKGNGJYH","created_at":"2026-05-20T02:05:45.516986+00:00"},{"alias_kind":"pith_short_8","alias_value":"PS6UM5SL","created_at":"2026-05-20T02:05:45.516986+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/PS6UM5SLUKGNGJYHCWQ563ITYT","json":"https://pith.science/pith/PS6UM5SLUKGNGJYHCWQ563ITYT.json","graph_json":"https://pith.science/api/pith-number/PS6UM5SLUKGNGJYHCWQ563ITYT/graph.json","events_json":"https://pith.science/api/pith-number/PS6UM5SLUKGNGJYHCWQ563ITYT/events.json","paper":"https://pith.science/paper/PS6UM5SL"},"agent_actions":{"view_html":"https://pith.science/pith/PS6UM5SLUKGNGJYHCWQ563ITYT","download_json":"https://pith.science/pith/PS6UM5SLUKGNGJYHCWQ563ITYT.json","view_paper":"https://pith.science/paper/PS6UM5SL","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2605.09329&json=true","fetch_graph":"https://pith.science/api/pith-number/PS6UM5SLUKGNGJYHCWQ563ITYT/graph.json","fetch_events":"https://pith.science/api/pith-number/PS6UM5SLUKGNGJYHCWQ563ITYT/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/PS6UM5SLUKGNGJYHCWQ563ITYT/action/timestamp_anchor","attest_storage":"https://pith.science/pith/PS6UM5SLUKGNGJYHCWQ563ITYT/action/storage_attestation","attest_author":"https://pith.science/pith/PS6UM5SLUKGNGJYHCWQ563ITYT/action/author_attestation","sign_citation":"https://pith.science/pith/PS6UM5SLUKGNGJYHCWQ563ITYT/action/citation_signature","submit_replication":"https://pith.science/pith/PS6UM5SLUKGNGJYHCWQ563ITYT/action/replication_record"}},"created_at":"2026-05-20T02:05:45.516986+00:00","updated_at":"2026-05-20T02:05:45.516986+00:00"}