{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2024:3WYX72X5SQT2EY5MTNNX24CJYY","short_pith_number":"pith:3WYX72X5","canonical_record":{"source":{"id":"2404.19737","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2024-04-30T17:33:57Z","cross_cats_sorted":[],"title_canon_sha256":"193f5844a8c69bc27a3c9495377ee7c2f62d7a994b0bf3c8691d4309735d0477","abstract_canon_sha256":"416350975cb8b534fbb7c32d3860af343ee9d2b1896c0f7273bab7be92c7a500"},"schema_version":"1.0"},"canonical_sha256":"ddb17feafd9427a263ac9b5b7d7049c62847350bb03b79d8d8bae0146738cb33","source":{"kind":"arxiv","id":"2404.19737","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2404.19737","created_at":"2026-05-17T23:38:47Z"},{"alias_kind":"arxiv_version","alias_value":"2404.19737v1","created_at":"2026-05-17T23:38:47Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2404.19737","created_at":"2026-05-17T23:38:47Z"},{"alias_kind":"pith_short_12","alias_value":"3WYX72X5SQT2","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"3WYX72X5SQT2EY5M","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"3WYX72X5","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2024:3WYX72X5SQT2EY5MTNNX24CJYY","target":"record","payload":{"canonical_record":{"source":{"id":"2404.19737","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2024-04-30T17:33:57Z","cross_cats_sorted":[],"title_canon_sha256":"193f5844a8c69bc27a3c9495377ee7c2f62d7a994b0bf3c8691d4309735d0477","abstract_canon_sha256":"416350975cb8b534fbb7c32d3860af343ee9d2b1896c0f7273bab7be92c7a500"},"schema_version":"1.0"},"canonical_sha256":"ddb17feafd9427a263ac9b5b7d7049c62847350bb03b79d8d8bae0146738cb33","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:47.884803Z","signature_b64":"3sn5tgtNhew8T6iCO5Jo9BlYk4viikUEMLtAuK6BveXMPQU85DO6AOfL6w9pnahZt57BPU2tWq4LJw6x9vcqDQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"ddb17feafd9427a263ac9b5b7d7049c62847350bb03b79d8d8bae0146738cb33","last_reissued_at":"2026-05-17T23:38:47.884145Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:47.884145Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2404.19737","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:47Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"6fMkVkO0US+dqA7tqspf3vL0PAQKZd7ZY9o7LfXfborRQHtPwVr20oP24et2q2dnZM2DdJQIoPjUkjqpTtFXBg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-02T22:41:27.177417Z"},"content_sha256":"8a4511a5f2a7202a700b86a22dcd3a9fc68a5d3b72dd271629283f84d12576f1","schema_version":"1.0","event_id":"sha256:8a4511a5f2a7202a700b86a22dcd3a9fc68a5d3b72dd271629283f84d12576f1"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2024:3WYX72X5SQT2EY5MTNNX24CJYY","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Better & Faster Large Language Models via Multi-token Prediction","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Training language models to predict multiple future tokens improves coding performance and speeds up inference","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Badr Youbi Idrissi, Baptiste Rozi\\`ere, David Lopez-Paz, Fabian Gloeckle, Gabriel Synnaeve","submitted_at":"2024-04-30T17:33:57Z","abstract_excerpt":"Large language models such as GPT and Llama are trained with a next-token prediction loss. In this work, we suggest that training language models to predict multiple future tokens at once results in higher sample efficiency. More specifically, at each position in the training corpus, we ask the model to predict the following n tokens using n independent output heads, operating on top of a shared model trunk. Considering multi-token prediction as an auxiliary training task, we measure improved downstream capabilities with no overhead in training time for both code and natural language models. T"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Our 13B parameter models solves 12 % more problems on HumanEval and 17 % more on MBPP than comparable next-token models. ... models trained with 4-token prediction are up to 3 times faster at inference, even with large batch sizes.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the reported gains are caused by the multi-token auxiliary objective rather than differences in hyper-parameters, data ordering, or other uncontrolled training details, and that the benefit persists without degradation at much larger scales.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Multi-token prediction training yields higher sample efficiency, better benchmark scores on code generation, and up to 3x faster inference than standard next-token prediction for LLMs.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Training language models to predict multiple future tokens improves coding performance and speeds up inference","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"d4d49512641eba82c64914f3c3652ed359b9057ed485b987ed8e4889cf09bf72"},"source":{"id":"2404.19737","kind":"arxiv","version":1},"verdict":{"id":"ac4cbfaa-290a-4952-a366-e7a29b1d2974","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-16T12:21:13.563553Z","strongest_claim":"Our 13B parameter models solves 12 % more problems on HumanEval and 17 % more on MBPP than comparable next-token models. ... models trained with 4-token prediction are up to 3 times faster at inference, even with large batch sizes.","one_line_summary":"Multi-token prediction training yields higher sample efficiency, better benchmark scores on code generation, and up to 3x faster inference than standard next-token prediction for LLMs.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the reported gains are caused by the multi-token auxiliary objective rather than differences in hyper-parameters, data ordering, or other uncontrolled training details, and that the benefit persists without degradation at much larger scales.","pith_extraction_headline":"Training language models to predict multiple future tokens improves coding performance and speeds up inference"},"references":{"count":23,"sample":[{"doi":"","year":null,"title":"Program Synthesis with Large Language Models","work_id":"fd241a05-03b9-4de2-9588-9d77ce176125","ref_index":1,"cited_arxiv_id":"2108.07732","is_internal_anchor":true},{"doi":"","year":null,"title":"Evaluating Large Language Models Trained on Code","work_id":"042493e9-b26f-4b4e-bbde-382072ca9b08","ref_index":2,"cited_arxiv_id":"2107.03374","is_internal_anchor":true},{"doi":"","year":null,"title":"Training Verifiers to Solve Math Word Problems","work_id":"acab1aa8-b4d6-40e0-a3ee-25341701dca2","ref_index":3,"cited_arxiv_id":"2110.14168","is_internal_anchor":true},{"doi":"","year":null,"title":"High Fidelity Neural Audio Compression","work_id":"bc645d2d-e9f2-4cb8-9a6d-bd557bc7a258","ref_index":4,"cited_arxiv_id":"2210.13438","is_internal_anchor":true},{"doi":"","year":2021,"title":"Leveraging parsbert and pretrained mt5 for persian abstractive text summarization","work_id":"0934b847-0d23-40fe-839c-92f6349edf54","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":23,"snapshot_sha256":"78c12a8dcc5e16eecc078d4d56bbcb3d6b88ea83927ec1efd81170549278a083","internal_anchors":5},"formal_canon":{"evidence_count":2,"snapshot_sha256":"0f693f76cd804dd6c5607a2418482231f9912992231dcd77e6e558e9e23cc471"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"ac4cbfaa-290a-4952-a366-e7a29b1d2974"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:47Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"wiAbqUau0cOi3Rq8zCXp5xSPoqMZSZ7Ua5CG5hHzH1me5eXB0gqMJMuEEPt38KEU68vxMH1z0Vc/jNzevMeZAw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-02T22:41:27.177943Z"},"content_sha256":"52c04d8fe8e30fe5335fa89e0b9b68a9a82ef1ad6488602757463b277436e12c","schema_version":"1.0","event_id":"sha256:52c04d8fe8e30fe5335fa89e0b9b68a9a82ef1ad6488602757463b277436e12c"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/3WYX72X5SQT2EY5MTNNX24CJYY/bundle.json","state_url":"https://pith.science/pith/3WYX72X5SQT2EY5MTNNX24CJYY/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/3WYX72X5SQT2EY5MTNNX24CJYY/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-02T22:41:27Z","links":{"resolver":"https://pith.science/pith/3WYX72X5SQT2EY5MTNNX24CJYY","bundle":"https://pith.science/pith/3WYX72X5SQT2EY5MTNNX24CJYY/bundle.json","state":"https://pith.science/pith/3WYX72X5SQT2EY5MTNNX24CJYY/state.json","well_known_bundle":"https://pith.science/.well-known/pith/3WYX72X5SQT2EY5MTNNX24CJYY/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2024:3WYX72X5SQT2EY5MTNNX24CJYY","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"416350975cb8b534fbb7c32d3860af343ee9d2b1896c0f7273bab7be92c7a500","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2024-04-30T17:33:57Z","title_canon_sha256":"193f5844a8c69bc27a3c9495377ee7c2f62d7a994b0bf3c8691d4309735d0477"},"schema_version":"1.0","source":{"id":"2404.19737","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2404.19737","created_at":"2026-05-17T23:38:47Z"},{"alias_kind":"arxiv_version","alias_value":"2404.19737v1","created_at":"2026-05-17T23:38:47Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2404.19737","created_at":"2026-05-17T23:38:47Z"},{"alias_kind":"pith_short_12","alias_value":"3WYX72X5SQT2","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"3WYX72X5SQT2EY5M","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"3WYX72X5","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:52c04d8fe8e30fe5335fa89e0b9b68a9a82ef1ad6488602757463b277436e12c","target":"graph","created_at":"2026-05-17T23:38:47Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Our 13B parameter models solves 12 % more problems on HumanEval and 17 % more on MBPP than comparable next-token models. ... models trained with 4-token prediction are up to 3 times faster at inference, even with large batch sizes."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the reported gains are caused by the multi-token auxiliary objective rather than differences in hyper-parameters, data ordering, or other uncontrolled training details, and that the benefit persists without degradation at much larger scales."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Multi-token prediction training yields higher sample efficiency, better benchmark scores on code generation, and up to 3x faster inference than standard next-token prediction for LLMs."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Training language models to predict multiple future tokens improves coding performance and speeds up inference"}],"snapshot_sha256":"d4d49512641eba82c64914f3c3652ed359b9057ed485b987ed8e4889cf09bf72"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"0f693f76cd804dd6c5607a2418482231f9912992231dcd77e6e558e9e23cc471"},"paper":{"abstract_excerpt":"Large language models such as GPT and Llama are trained with a next-token prediction loss. In this work, we suggest that training language models to predict multiple future tokens at once results in higher sample efficiency. More specifically, at each position in the training corpus, we ask the model to predict the following n tokens using n independent output heads, operating on top of a shared model trunk. Considering multi-token prediction as an auxiliary training task, we measure improved downstream capabilities with no overhead in training time for both code and natural language models. T","authors_text":"Badr Youbi Idrissi, Baptiste Rozi\\`ere, David Lopez-Paz, Fabian Gloeckle, Gabriel Synnaeve","cross_cats":[],"headline":"Training language models to predict multiple future tokens improves coding performance and speeds up inference","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2024-04-30T17:33:57Z","title":"Better & Faster Large Language Models via Multi-token Prediction"},"references":{"count":23,"internal_anchors":5,"resolved_work":23,"sample":[{"cited_arxiv_id":"2108.07732","doi":"","is_internal_anchor":true,"ref_index":1,"title":"Program Synthesis with Large Language Models","work_id":"fd241a05-03b9-4de2-9588-9d77ce176125","year":null},{"cited_arxiv_id":"2107.03374","doi":"","is_internal_anchor":true,"ref_index":2,"title":"Evaluating Large Language Models Trained on Code","work_id":"042493e9-b26f-4b4e-bbde-382072ca9b08","year":null},{"cited_arxiv_id":"2110.14168","doi":"","is_internal_anchor":true,"ref_index":3,"title":"Training Verifiers to Solve Math Word Problems","work_id":"acab1aa8-b4d6-40e0-a3ee-25341701dca2","year":null},{"cited_arxiv_id":"2210.13438","doi":"","is_internal_anchor":true,"ref_index":4,"title":"High Fidelity Neural Audio Compression","work_id":"bc645d2d-e9f2-4cb8-9a6d-bd557bc7a258","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Leveraging parsbert and pretrained mt5 for persian abstractive text summarization","work_id":"0934b847-0d23-40fe-839c-92f6349edf54","year":2021}],"snapshot_sha256":"78c12a8dcc5e16eecc078d4d56bbcb3d6b88ea83927ec1efd81170549278a083"},"source":{"id":"2404.19737","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-16T12:21:13.563553Z","id":"ac4cbfaa-290a-4952-a366-e7a29b1d2974","model_set":{"reader":"grok-4.3"},"one_line_summary":"Multi-token prediction training yields higher sample efficiency, better benchmark scores on code generation, and up to 3x faster inference than standard next-token prediction for LLMs.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Training language models to predict multiple future tokens improves coding performance and speeds up inference","strongest_claim":"Our 13B parameter models solves 12 % more problems on HumanEval and 17 % more on MBPP than comparable next-token models. ... models trained with 4-token prediction are up to 3 times faster at inference, even with large batch sizes.","weakest_assumption":"That the reported gains are caused by the multi-token auxiliary objective rather than differences in hyper-parameters, data ordering, or other uncontrolled training details, and that the benefit persists without degradation at much larger scales."}},"verdict_id":"ac4cbfaa-290a-4952-a366-e7a29b1d2974"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:8a4511a5f2a7202a700b86a22dcd3a9fc68a5d3b72dd271629283f84d12576f1","target":"record","created_at":"2026-05-17T23:38:47Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"416350975cb8b534fbb7c32d3860af343ee9d2b1896c0f7273bab7be92c7a500","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2024-04-30T17:33:57Z","title_canon_sha256":"193f5844a8c69bc27a3c9495377ee7c2f62d7a994b0bf3c8691d4309735d0477"},"schema_version":"1.0","source":{"id":"2404.19737","kind":"arxiv","version":1}},"canonical_sha256":"ddb17feafd9427a263ac9b5b7d7049c62847350bb03b79d8d8bae0146738cb33","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"ddb17feafd9427a263ac9b5b7d7049c62847350bb03b79d8d8bae0146738cb33","first_computed_at":"2026-05-17T23:38:47.884145Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:47.884145Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"3sn5tgtNhew8T6iCO5Jo9BlYk4viikUEMLtAuK6BveXMPQU85DO6AOfL6w9pnahZt57BPU2tWq4LJw6x9vcqDQ==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:47.884803Z","signed_message":"canonical_sha256_bytes"},"source_id":"2404.19737","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:8a4511a5f2a7202a700b86a22dcd3a9fc68a5d3b72dd271629283f84d12576f1","sha256:52c04d8fe8e30fe5335fa89e0b9b68a9a82ef1ad6488602757463b277436e12c"],"state_sha256":"0f285072d257782d1728481ce8cfcf003c91cf62d5251a6359c9bf3035754cb3"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"wXKOtEYWAivXKjvL+daDBte2yIlTnVKC+iEZZs5aXZEkGE6gQL43ZGxSXOxzcqvoxbbmkWg4wHJhlzK+M1C6Dw==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-02T22:41:27.180412Z","bundle_sha256":"fbf8139ee0cd605bbfd7ffa36788190cf32ab4152906ae9d4ede03edfbb0841c"}}