{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:VXY2QJRMYB5BCEU2CDI2UN2VCC","short_pith_number":"pith:VXY2QJRM","schema_version":"1.0","canonical_sha256":"adf1a8262cc07a11129a10d1aa3755109abcb75eeeff24231bd3373b15b6f058","source":{"kind":"arxiv","id":"2605.14220","version":1},"attestation_state":"computed","paper":{"title":"Diagnosing Training Inference Mismatch in LLM Reinforcement Learning","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"Small token-level numerical disagreements can independently cause training collapse in LLM reinforcement learning.","cross_cats":["cs.AI","cs.CL"],"primary_cat":"cs.LG","authors_text":"Geoffrey Fox, Neiwen Ling, Peng Wu, Tianle Zhong, Tianshu Yu, Xiao Yu, Yifan Pi, Zijun Wei","submitted_at":"2026-05-14T00:27:35Z","abstract_excerpt":"Modern LLM RL systems separate rollout generation from policy optimization. These two stages are expected to produce token probabilities that match exactly. However, implementation differences can make them assign different values to the same sequence under the same model weights, inducing Training-Inference Mismatch (TIM). TIM is difficult to inspect because it is entangled with off-policy drift and common stabilization mechanisms. In this work, we isolate TIM in a zero-mismatch diagnostic setting (VeXact), and show that small token-level numerical disagreements can independently cause traini"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":true,"formal_links_present":false},"canonical_record":{"source":{"id":"2605.14220","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2026-05-14T00:27:35Z","cross_cats_sorted":["cs.AI","cs.CL"],"title_canon_sha256":"b3906a1c2350895f3131cb1dd66a543835acac8b02f98ba394dcc8c2edf47924","abstract_canon_sha256":"7194c730f30bb55b8b96467914e6d0ecc1a85d0c86623a37e0c1509ba2261ac8"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:39:10.831901Z","signature_b64":"u7BlN/RW3URKl1qv9CFrQWTQdgfMbOK1+4mh1XcT7UfjolPLNLbKus059HQo/DNccQT94I4PmAjqVBxWFkpBAQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"adf1a8262cc07a11129a10d1aa3755109abcb75eeeff24231bd3373b15b6f058","last_reissued_at":"2026-05-17T23:39:10.831276Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:39:10.831276Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Diagnosing Training Inference Mismatch in LLM Reinforcement Learning","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"Small token-level numerical disagreements can independently cause training collapse in LLM reinforcement learning.","cross_cats":["cs.AI","cs.CL"],"primary_cat":"cs.LG","authors_text":"Geoffrey Fox, Neiwen Ling, Peng Wu, Tianle Zhong, Tianshu Yu, Xiao Yu, Yifan Pi, Zijun Wei","submitted_at":"2026-05-14T00:27:35Z","abstract_excerpt":"Modern LLM RL systems separate rollout generation from policy optimization. These two stages are expected to produce token probabilities that match exactly. However, implementation differences can make them assign different values to the same sequence under the same model weights, inducing Training-Inference Mismatch (TIM). TIM is difficult to inspect because it is entangled with off-policy drift and common stabilization mechanisms. In this work, we isolate TIM in a zero-mismatch diagnostic setting (VeXact), and show that small token-level numerical disagreements can independently cause traini"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"small token-level numerical disagreements can independently cause training collapse","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The VeXact diagnostic setting successfully isolates TIM from off-policy drift and stabilization mechanisms without introducing new artifacts.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Training-inference mismatch in separated rollout and optimization stages of LLM RL can independently cause training collapse.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Small token-level numerical disagreements can independently cause training collapse in LLM reinforcement learning.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"510291bcb8c2e447d1dfd168ee475f6ed45940bd76dd1db812266d5429e1ad59"},"source":{"id":"2605.14220","kind":"arxiv","version":1},"verdict":{"id":"45215c6a-99a7-4f94-87e0-1519825b8a4c","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T02:01:58.461730Z","strongest_claim":"small token-level numerical disagreements can independently cause training collapse","one_line_summary":"Training-inference mismatch in separated rollout and optimization stages of LLM RL can independently cause training collapse.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The VeXact diagnostic setting successfully isolates TIM from off-policy drift and stabilization mechanisms without introducing new artifacts.","pith_extraction_headline":"Small token-level numerical disagreements can independently cause training collapse in LLM reinforcement learning."},"references":{"count":53,"sample":[{"doi":"","year":2025,"title":"Every Step Evolves: Scaling Reinforcement Learning for Trillion-Scale Thinking Model , author=. 2025 , eprint=","work_id":"b7cd4698-3cfb-43f3-9b78-f450a82291b8","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Llms can learn to reason via off-policy rl","work_id":"17c61d97-2163-4101-81d6-c7650b930ca4","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Thinking Machines Lab: Connectionism , year =","work_id":"d9c0bd4d-412c-4aa4-80f2-ffdb6caccd02","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2025,"title":"DeepSeek-V3 Technical Report , author=. 2025 , eprint=","work_id":"58376947-1a64-4892-a13c-2eb8e0ab2c9f","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"DeepSeek-V2: A Strong, Economical, and Efficient Mixture-of-Experts Language Model , author=. 2024 , eprint=","work_id":"c340eca6-fc11-45b6-a0d7-068752a731c8","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":53,"snapshot_sha256":"6540aaefb7526ff2a5d4d3f257cc8664dcc07e956efedf6bb9bb0af2ce4ff9ba","internal_anchors":6},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2605.14220","created_at":"2026-05-17T23:39:10.831391+00:00"},{"alias_kind":"arxiv_version","alias_value":"2605.14220v1","created_at":"2026-05-17T23:39:10.831391+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.14220","created_at":"2026-05-17T23:39:10.831391+00:00"},{"alias_kind":"pith_short_12","alias_value":"VXY2QJRMYB5B","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_16","alias_value":"VXY2QJRMYB5BCEU2","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_8","alias_value":"VXY2QJRM","created_at":"2026-05-18T12:33:37.589309+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/VXY2QJRMYB5BCEU2CDI2UN2VCC","json":"https://pith.science/pith/VXY2QJRMYB5BCEU2CDI2UN2VCC.json","graph_json":"https://pith.science/api/pith-number/VXY2QJRMYB5BCEU2CDI2UN2VCC/graph.json","events_json":"https://pith.science/api/pith-number/VXY2QJRMYB5BCEU2CDI2UN2VCC/events.json","paper":"https://pith.science/paper/VXY2QJRM"},"agent_actions":{"view_html":"https://pith.science/pith/VXY2QJRMYB5BCEU2CDI2UN2VCC","download_json":"https://pith.science/pith/VXY2QJRMYB5BCEU2CDI2UN2VCC.json","view_paper":"https://pith.science/paper/VXY2QJRM","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2605.14220&json=true","fetch_graph":"https://pith.science/api/pith-number/VXY2QJRMYB5BCEU2CDI2UN2VCC/graph.json","fetch_events":"https://pith.science/api/pith-number/VXY2QJRMYB5BCEU2CDI2UN2VCC/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/VXY2QJRMYB5BCEU2CDI2UN2VCC/action/timestamp_anchor","attest_storage":"https://pith.science/pith/VXY2QJRMYB5BCEU2CDI2UN2VCC/action/storage_attestation","attest_author":"https://pith.science/pith/VXY2QJRMYB5BCEU2CDI2UN2VCC/action/author_attestation","sign_citation":"https://pith.science/pith/VXY2QJRMYB5BCEU2CDI2UN2VCC/action/citation_signature","submit_replication":"https://pith.science/pith/VXY2QJRMYB5BCEU2CDI2UN2VCC/action/replication_record"}},"created_at":"2026-05-17T23:39:10.831391+00:00","updated_at":"2026-05-17T23:39:10.831391+00:00"}