{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:W6PEMK224EFCSDJFXIVW6DEP2J","short_pith_number":"pith:W6PEMK22","schema_version":"1.0","canonical_sha256":"b79e462b5ae10a290d25ba2b6f0c8fd27b63fa39eb917f7f0f46dd8139d78f02","source":{"kind":"arxiv","id":"2605.20865","version":1},"attestation_state":"computed","paper":{"title":"Multi-Step Likelihood-Ratio Correction for Reinforcement Learning with Verifiable Rewards","license":"http://creativecommons.org/licenses/by-sa/4.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.LG","authors_text":"Byeongchan Kim, Deokgyu Yoon, Gyungin Shin, Hyungkyu Kang, Joongkyu Lee, Min-hwan Oh, Sungrae Park","submitted_at":"2026-05-20T08:01:01Z","abstract_excerpt":"Reinforcement learning with verifiable rewards (RLVR) plays a pivotal role in improving the reasoning ability of large language models. However, widely used PPO surrogate objectives are fundamentally local, as they rely on a local approximation of the exact policy gradient objective. While this approximation improves stability by reducing the variance induced by importance sampling, it also introduces structural bias into the surrogate objective, which must be controlled through trust region mechanisms. In this work, we introduce the $N$-step forward trace, which augments the PPO surrogate obj"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2605.20865","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by-sa/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-20T08:01:01Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"a849d6d96b95676fd372a94be2e383f3dc6ef552de13c41d92356a464cd2cc90","abstract_canon_sha256":"06fb631741cf1ba35803f6601ca7d31d63f3163a426b82e0844ac31f1d28e4ec"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-21T01:05:25.254223Z","signature_b64":"GBukB5WYVTARW3IrDgg0+StIx8QWfSpiTJ5lfscwWUJ1ntsGoX2wjFk1mK+fUuqQzYjKsoAEWg5rqnN5m/XdBQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"b79e462b5ae10a290d25ba2b6f0c8fd27b63fa39eb917f7f0f46dd8139d78f02","last_reissued_at":"2026-05-21T01:05:25.253615Z","signature_status":"signed_v1","first_computed_at":"2026-05-21T01:05:25.253615Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Multi-Step Likelihood-Ratio Correction for Reinforcement Learning with Verifiable Rewards","license":"http://creativecommons.org/licenses/by-sa/4.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.LG","authors_text":"Byeongchan Kim, Deokgyu Yoon, Gyungin Shin, Hyungkyu Kang, Joongkyu Lee, Min-hwan Oh, Sungrae Park","submitted_at":"2026-05-20T08:01:01Z","abstract_excerpt":"Reinforcement learning with verifiable rewards (RLVR) plays a pivotal role in improving the reasoning ability of large language models. However, widely used PPO surrogate objectives are fundamentally local, as they rely on a local approximation of the exact policy gradient objective. While this approximation improves stability by reducing the variance induced by importance sampling, it also introduces structural bias into the surrogate objective, which must be controlled through trust region mechanisms. In this work, we introduce the $N$-step forward trace, which augments the PPO surrogate obj"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.20865","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.20865/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2605.20865","created_at":"2026-05-21T01:05:25.253722+00:00"},{"alias_kind":"arxiv_version","alias_value":"2605.20865v1","created_at":"2026-05-21T01:05:25.253722+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.20865","created_at":"2026-05-21T01:05:25.253722+00:00"},{"alias_kind":"pith_short_12","alias_value":"W6PEMK224EFC","created_at":"2026-05-21T01:05:25.253722+00:00"},{"alias_kind":"pith_short_16","alias_value":"W6PEMK224EFCSDJF","created_at":"2026-05-21T01:05:25.253722+00:00"},{"alias_kind":"pith_short_8","alias_value":"W6PEMK22","created_at":"2026-05-21T01:05:25.253722+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/W6PEMK224EFCSDJFXIVW6DEP2J","json":"https://pith.science/pith/W6PEMK224EFCSDJFXIVW6DEP2J.json","graph_json":"https://pith.science/api/pith-number/W6PEMK224EFCSDJFXIVW6DEP2J/graph.json","events_json":"https://pith.science/api/pith-number/W6PEMK224EFCSDJFXIVW6DEP2J/events.json","paper":"https://pith.science/paper/W6PEMK22"},"agent_actions":{"view_html":"https://pith.science/pith/W6PEMK224EFCSDJFXIVW6DEP2J","download_json":"https://pith.science/pith/W6PEMK224EFCSDJFXIVW6DEP2J.json","view_paper":"https://pith.science/paper/W6PEMK22","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2605.20865&json=true","fetch_graph":"https://pith.science/api/pith-number/W6PEMK224EFCSDJFXIVW6DEP2J/graph.json","fetch_events":"https://pith.science/api/pith-number/W6PEMK224EFCSDJFXIVW6DEP2J/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/W6PEMK224EFCSDJFXIVW6DEP2J/action/timestamp_anchor","attest_storage":"https://pith.science/pith/W6PEMK224EFCSDJFXIVW6DEP2J/action/storage_attestation","attest_author":"https://pith.science/pith/W6PEMK224EFCSDJFXIVW6DEP2J/action/author_attestation","sign_citation":"https://pith.science/pith/W6PEMK224EFCSDJFXIVW6DEP2J/action/citation_signature","submit_replication":"https://pith.science/pith/W6PEMK224EFCSDJFXIVW6DEP2J/action/replication_record"}},"created_at":"2026-05-21T01:05:25.253722+00:00","updated_at":"2026-05-21T01:05:25.253722+00:00"}