{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:MT66L2EJ7JDPLD4655GWIFMOJ6","short_pith_number":"pith:MT66L2EJ","canonical_record":{"source":{"id":"2605.05112","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-06T16:44:38Z","cross_cats_sorted":[],"title_canon_sha256":"ff78394a3014356faa26a65fa7d1b9539095febbe427226953a86d4b8c1b7bf9","abstract_canon_sha256":"3aa61b8262eb85c21d150031d0ca3420ae03f2ea67a89971bc860db972c39f44"},"schema_version":"1.0"},"canonical_sha256":"64fde5e889fa46f58f9eef4d64158e4fb4e82bc2cd8165a1ca2a2fa621389270","source":{"kind":"arxiv","id":"2605.05112","version":3},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.05112","created_at":"2026-05-20T00:01:42Z"},{"alias_kind":"arxiv_version","alias_value":"2605.05112v3","created_at":"2026-05-20T00:01:42Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.05112","created_at":"2026-05-20T00:01:42Z"},{"alias_kind":"pith_short_12","alias_value":"MT66L2EJ7JDP","created_at":"2026-05-20T00:01:42Z"},{"alias_kind":"pith_short_16","alias_value":"MT66L2EJ7JDPLD46","created_at":"2026-05-20T00:01:42Z"},{"alias_kind":"pith_short_8","alias_value":"MT66L2EJ","created_at":"2026-05-20T00:01:42Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:MT66L2EJ7JDPLD4655GWIFMOJ6","target":"record","payload":{"canonical_record":{"source":{"id":"2605.05112","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-06T16:44:38Z","cross_cats_sorted":[],"title_canon_sha256":"ff78394a3014356faa26a65fa7d1b9539095febbe427226953a86d4b8c1b7bf9","abstract_canon_sha256":"3aa61b8262eb85c21d150031d0ca3420ae03f2ea67a89971bc860db972c39f44"},"schema_version":"1.0"},"canonical_sha256":"64fde5e889fa46f58f9eef4d64158e4fb4e82bc2cd8165a1ca2a2fa621389270","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T00:01:42.843208Z","signature_b64":"pvNDJ4Tqc3SlQBb3X1lwZ6dhXU8A1L5s5ta1SIPv6m4gdDuDNueID3goWsSX+UTyIq33xExSgfqUdXtp3ycJBw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"64fde5e889fa46f58f9eef4d64158e4fb4e82bc2cd8165a1ca2a2fa621389270","last_reissued_at":"2026-05-20T00:01:42.842444Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T00:01:42.842444Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2605.05112","source_version":3,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-20T00:01:42Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"6Jxipanw/f5hQhM8K6nx1LJlKalfZ/TRI8Y3D9dUBjWOGdBaCAnt2dwWiVbU51/s3JJBXMjr4b9S64jMT6XJDw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-26T04:55:14.709939Z"},"content_sha256":"5931413dcc68ec370a4fada7d9c6f7c5ada38596d415414a149b8f9889d7e7ee","schema_version":"1.0","event_id":"sha256:5931413dcc68ec370a4fada7d9c6f7c5ada38596d415414a149b8f9889d7e7ee"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:MT66L2EJ7JDPLD4655GWIFMOJ6","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Rollout Pass-Rate Control: Steering Binary-Reward RL Toward Its Most Informative Regime","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Steering rollout pass rates to 50 percent strengthens binary-reward signals in agentic RL.","cross_cats":[],"primary_cat":"cs.LG","authors_text":"Dawei Yin, Daxiang Dong, Dou Shen, Haotian Zhao, Jianmin Wu, Jingnan Gu, Lun Tian, Tianshu Zhu, Wenyu Zhang, Xiaoying Zuo, Yucheng Zeng","submitted_at":"2026-05-06T16:44:38Z","abstract_excerpt":"Agentic reinforcement learning (RL) for software engineering spends much of its compute on stateful trajectories whose grouped binary rewards are highly skewed and weakly contrastive. We frame this as pass-rate control and show that the binary reward-side signal is strongest near a 50% rollout pass rate under four criteria: reward entropy, group-filtering survival, leave-one-out (RLOO) advantage energy under Group Relative Policy Optimization (GRPO), and success-failure pair count. We propose Prefix Sampling (PS), which replays self-generated trajectory prefixes to steer skewed groups toward t"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"The binary reward-side signal is strongest near a 50% rollout pass rate under four criteria: reward entropy, group-filtering survival, leave-one-out (RLOO) advantage energy under GRPO, and success-failure pair count. Prefix Sampling steers groups toward this regime and yields 2.01x and 1.55x wall-clock speedups while matching or exceeding baseline scores.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That replaying prefixes from prior trajectories and masking their tokens will steer pass rates to the informative regime without introducing systematic bias into the policy gradient or destabilizing GRPO optimization.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Prefix Sampling replays self-generated trajectory prefixes to control rollout pass rates near 50% in binary-reward RL, delivering wall-clock speedups and modest performance gains on SWE-bench Verified and AIME tasks.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Steering rollout pass rates to 50 percent strengthens binary-reward signals in agentic RL.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"fd7622f75ae27df196f6a6f96d70a7e52826d6bb150b484c646cb9c334f134da"},"source":{"id":"2605.05112","kind":"arxiv","version":3},"verdict":{"id":"9325516a-2470-4889-906a-4b7aa82083c1","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-19T17:28:23.118864Z","strongest_claim":"The binary reward-side signal is strongest near a 50% rollout pass rate under four criteria: reward entropy, group-filtering survival, leave-one-out (RLOO) advantage energy under GRPO, and success-failure pair count. Prefix Sampling steers groups toward this regime and yields 2.01x and 1.55x wall-clock speedups while matching or exceeding baseline scores.","one_line_summary":"Prefix Sampling replays self-generated trajectory prefixes to control rollout pass rates near 50% in binary-reward RL, delivering wall-clock speedups and modest performance gains on SWE-bench Verified and AIME tasks.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That replaying prefixes from prior trajectories and masking their tokens will steer pass rates to the informative regime without introducing systematic bias into the policy gradient or destabilizing GRPO optimization.","pith_extraction_headline":"Steering rollout pass rates to 50 percent strengthens binary-reward signals in agentic RL."},"integrity":{"clean":false,"summary":{"advisory":2,"critical":0,"by_detector":{"doi_compliance":{"total":2,"advisory":2,"critical":0,"informational":0}},"informational":0},"endpoint":"/pith/2605.05112/integrity.json","findings":[{"note":"DOI in the printed bibliography is fragmented by whitespace or line breaks. A longer candidate (10.48550/arXiv.1707.06347.URLhttps://arxiv.org/abs/1707.06347) was visible in the surrounding text but could not be confirmed against doi.org as printed.","detector":"doi_compliance","severity":"advisory","ref_index":25,"audited_at":"2026-05-19T13:49:28.699885Z","detected_doi":"10.48550/arXiv.1707.06347.URLhttps://arxiv.org/abs/1707.06347","finding_type":"recoverable_identifier","verdict_class":"incontrovertible","detected_arxiv_id":null},{"note":"DOI in the printed bibliography is fragmented by whitespace or line breaks. A longer candidate (10.48550/arXiv.2505.16400.URLhttps://arxiv.org/abs/2505.16400) was visible in the surrounding text but could not be confirmed against doi.org as printed.","detector":"doi_compliance","severity":"advisory","ref_index":32,"audited_at":"2026-05-19T13:49:28.699885Z","detected_doi":"10.48550/arXiv.2505.16400.URLhttps://arxiv.org/abs/2505.16400","finding_type":"recoverable_identifier","verdict_class":"incontrovertible","detected_arxiv_id":null}],"available":true,"detectors_run":[{"name":"doi_title_agreement","ran_at":"2026-05-19T21:31:19.562055Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"doi_compliance","ran_at":"2026-05-19T13:49:28.699885Z","status":"completed","version":"1.0.0","findings_count":2}],"snapshot_sha256":"060df40dbd2eb36c3189bf00c8bb7f86f84f4260deb4c07f9488f576720145d3"},"references":{"count":37,"sample":[{"doi":"10.48550/arxiv.2402.03300","year":2024,"title":"DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models","work_id":"c5006563-f3ec-438a-9e35-b7b484f34828","ref_index":1,"cited_arxiv_id":"2402.03300","is_internal_anchor":true},{"doi":"10.48550/arxiv.2503.14476","year":2025,"title":"DAPO: An Open-Source LLM Reinforcement Learning System at Scale","work_id":"64019d00-0b11-4bbd-b173-b46c8fad0157","ref_index":2,"cited_arxiv_id":"2503.14476","is_internal_anchor":true},{"doi":"","year":2025,"title":"DeepSWE: Training a fully open-sourced, state-of-the-art coding agent by scaling RL","work_id":"6fc5b99c-abe6-4a52-94f7-53dbf6454fc5","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2025,"title":"Let it flow: Agentic crafting on rock and roll, building the rome model within an open agentic learning ecosystem.arXiv preprint","work_id":"a1afde43-96e3-49c9-af14-25f128d65fe3","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2026,"title":"Reuse your FLOPs: Scaling RL on hard problems by conditioning on very off-policy prefixes.CoRR, abs/2601.18795","work_id":"2be8a44f-c445-4fc6-9eec-82c26f802afc","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":37,"snapshot_sha256":"39f4506ac935ea9ff60270a0cd9b36327c3a953bee76b55bb3a6083483f326b6","internal_anchors":7},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"9325516a-2470-4889-906a-4b7aa82083c1"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-20T00:01:42Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"Pe26iEMs68JNdF6V+MDxkCrWsOsc9G0CmVcp2aegGN+tTxgz5kn1oabhrOSayn1sjo71LgfU/+FULoW9Naw0AQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-26T04:55:14.710630Z"},"content_sha256":"05c9ff4aff9d88ba2330d58a74ba3b050a1c8e43b88acb7e11634472ed17b006","schema_version":"1.0","event_id":"sha256:05c9ff4aff9d88ba2330d58a74ba3b050a1c8e43b88acb7e11634472ed17b006"},{"event_type":"integrity_finding","subject_pith_number":"pith:2026:MT66L2EJ7JDPLD4655GWIFMOJ6","target":"integrity","payload":{"note":"DOI in the printed bibliography is fragmented by whitespace or line breaks. A longer candidate (10.48550/arXiv.2505.16400.URLhttps://arxiv.org/abs/2505.16400) was visible in the surrounding text but could not be confirmed against doi.org as printed.","snippet":"Yang Chen, Zhuolin Yang, Zihan Liu, Chankyu Lee, Peng Xu, Mohammad Shoeybi, Bryan Catanzaro, and Wei Ping. AceReason-Nemotron: Advancing math and code reasoning through reinforcement learning.arXiv preprint arXiv:2505.16400, 2025. doi: 10.4","arxiv_id":"2605.05112","detector":"doi_compliance","evidence":{"ref_index":32,"verdict_class":"incontrovertible","resolved_title":null,"printed_excerpt":"Yang Chen, Zhuolin Yang, Zihan Liu, Chankyu Lee, Peng Xu, Mohammad Shoeybi, Bryan Catanzaro, and Wei Ping. AceReason-Nemotron: Advancing math and code reasoning through reinforcement learning.arXiv preprint arXiv:2505.16400, 2025. doi: 10.4","reconstructed_doi":"10.48550/arXiv.2505.16400.URLhttps://arxiv.org/abs/2505.16400"},"severity":"advisory","ref_index":32,"audited_at":"2026-05-19T13:49:28.699885Z","event_type":"pith.integrity.v1","detected_doi":"10.48550/arXiv.2505.16400.URLhttps://arxiv.org/abs/2505.16400","detector_url":"https://pith.science/pith-integrity-protocol#doi_compliance","external_url":null,"finding_type":"recoverable_identifier","evidence_hash":"550025d77d679d83162bdde4e89cb235a1d12413b4eea4b29714c6473295b589","paper_version":2,"verdict_class":"incontrovertible","resolved_title":null,"detector_version":"1.0.0","detected_arxiv_id":null,"integrity_event_id":1525,"payload_sha256":"18bc7f1330592370a13bbaa4224e1fc7677d0ab16df1113c226fe9fa339dd11c","signature_b64":"lMQhmMP+N97a4iW2mwMNYvyAptTWmFs9zqnia7yJDsCBSBkfS3spze0QnveS0fTYdanRpXu7/vPnDml8TKwrCw==","signing_key_id":"pith-v1-2026-05"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-19T13:52:05Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"ykLcDWaPHkoWrT7Y7UrbRGO98+o5K+J718EQzv/9PDPThlGBaAwVBwBF0HklvptE4Jvz0Yi1vYk3I5JEV2HHAg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-26T04:55:14.711723Z"},"content_sha256":"d9dba3b3523b1b19eaa0dc23de27605d95e900da21a0c364366d188bcf4fbb55","schema_version":"1.0","event_id":"sha256:d9dba3b3523b1b19eaa0dc23de27605d95e900da21a0c364366d188bcf4fbb55"},{"event_type":"integrity_finding","subject_pith_number":"pith:2026:MT66L2EJ7JDPLD4655GWIFMOJ6","target":"integrity","payload":{"note":"DOI in the printed bibliography is fragmented by whitespace or line breaks. A longer candidate (10.48550/arXiv.1707.06347.URLhttps://arxiv.org/abs/1707.06347) was visible in the surrounding text but could not be confirmed against doi.org as printed.","snippet":"John Schulman, Filip Wolski, Prafulla Dhariwal, Alec Radford, and Oleg Klimov. Proximal policy optimization algorithms.arXiv preprint arXiv:1707.06347, 2017. doi: 10.48550/arXiv. 1707.06347. URLhttps://arxiv.org/abs/1707.06347","arxiv_id":"2605.05112","detector":"doi_compliance","evidence":{"ref_index":25,"verdict_class":"incontrovertible","resolved_title":null,"printed_excerpt":"John Schulman, Filip Wolski, Prafulla Dhariwal, Alec Radford, and Oleg Klimov. Proximal policy optimization algorithms.arXiv preprint arXiv:1707.06347, 2017. doi: 10.48550/arXiv. 1707.06347. URLhttps://arxiv.org/abs/1707.06347","reconstructed_doi":"10.48550/arXiv.1707.06347.URLhttps://arxiv.org/abs/1707.06347"},"severity":"advisory","ref_index":25,"audited_at":"2026-05-19T13:49:28.699885Z","event_type":"pith.integrity.v1","detected_doi":"10.48550/arXiv.1707.06347.URLhttps://arxiv.org/abs/1707.06347","detector_url":"https://pith.science/pith-integrity-protocol#doi_compliance","external_url":null,"finding_type":"recoverable_identifier","evidence_hash":"3c44e83298ad9585784725f4c23caf7df002555127c256755e0017a239691be0","paper_version":2,"verdict_class":"incontrovertible","resolved_title":null,"detector_version":"1.0.0","detected_arxiv_id":null,"integrity_event_id":1524,"payload_sha256":"5f91acea97cd4825e1c5e973f0ea21e77376058bf5c40838dbeb18598ffd2a35","signature_b64":"lE+vV86vGiWFgeb7DjDndWR9DPhnFxn6PxZCZVuZ35kb0Yq3fJ+C8bEgda3fXG6cAuLTleMChEtGRmql4tgdDw==","signing_key_id":"pith-v1-2026-05"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-19T13:52:05Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"M8AcxSWHPfETMkfypf4bUVOT6fTWJYnF24bxJu+gduNhBSoN522K4jQ146Ey1Xj3eIEgplRz06y+1oKWi3fUDg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-26T04:55:14.712011Z"},"content_sha256":"21fc751c656e5720262086b26b09fe4f5f4548ff8e9770dcbf8a0980dfca004d","schema_version":"1.0","event_id":"sha256:21fc751c656e5720262086b26b09fe4f5f4548ff8e9770dcbf8a0980dfca004d"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/MT66L2EJ7JDPLD4655GWIFMOJ6/bundle.json","state_url":"https://pith.science/pith/MT66L2EJ7JDPLD4655GWIFMOJ6/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/MT66L2EJ7JDPLD4655GWIFMOJ6/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-26T04:55:14Z","links":{"resolver":"https://pith.science/pith/MT66L2EJ7JDPLD4655GWIFMOJ6","bundle":"https://pith.science/pith/MT66L2EJ7JDPLD4655GWIFMOJ6/bundle.json","state":"https://pith.science/pith/MT66L2EJ7JDPLD4655GWIFMOJ6/state.json","well_known_bundle":"https://pith.science/.well-known/pith/MT66L2EJ7JDPLD4655GWIFMOJ6/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:MT66L2EJ7JDPLD4655GWIFMOJ6","merge_version":"pith-open-graph-merge-v1","event_count":4,"valid_event_count":4,"invalid_event_count":0,"equivocation_count":1,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"3aa61b8262eb85c21d150031d0ca3420ae03f2ea67a89971bc860db972c39f44","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-06T16:44:38Z","title_canon_sha256":"ff78394a3014356faa26a65fa7d1b9539095febbe427226953a86d4b8c1b7bf9"},"schema_version":"1.0","source":{"id":"2605.05112","kind":"arxiv","version":3}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.05112","created_at":"2026-05-20T00:01:42Z"},{"alias_kind":"arxiv_version","alias_value":"2605.05112v3","created_at":"2026-05-20T00:01:42Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.05112","created_at":"2026-05-20T00:01:42Z"},{"alias_kind":"pith_short_12","alias_value":"MT66L2EJ7JDP","created_at":"2026-05-20T00:01:42Z"},{"alias_kind":"pith_short_16","alias_value":"MT66L2EJ7JDPLD46","created_at":"2026-05-20T00:01:42Z"},{"alias_kind":"pith_short_8","alias_value":"MT66L2EJ","created_at":"2026-05-20T00:01:42Z"}],"graph_snapshots":[{"event_id":"sha256:05c9ff4aff9d88ba2330d58a74ba3b050a1c8e43b88acb7e11634472ed17b006","target":"graph","created_at":"2026-05-20T00:01:42Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"The binary reward-side signal is strongest near a 50% rollout pass rate under four criteria: reward entropy, group-filtering survival, leave-one-out (RLOO) advantage energy under GRPO, and success-failure pair count. Prefix Sampling steers groups toward this regime and yields 2.01x and 1.55x wall-clock speedups while matching or exceeding baseline scores."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That replaying prefixes from prior trajectories and masking their tokens will steer pass rates to the informative regime without introducing systematic bias into the policy gradient or destabilizing GRPO optimization."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Prefix Sampling replays self-generated trajectory prefixes to control rollout pass rates near 50% in binary-reward RL, delivering wall-clock speedups and modest performance gains on SWE-bench Verified and AIME tasks."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Steering rollout pass rates to 50 percent strengthens binary-reward signals in agentic RL."}],"snapshot_sha256":"fd7622f75ae27df196f6a6f96d70a7e52826d6bb150b484c646cb9c334f134da"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":false,"detectors_run":[{"findings_count":0,"name":"doi_title_agreement","ran_at":"2026-05-19T21:31:19.562055Z","status":"completed","version":"1.0.0"},{"findings_count":2,"name":"doi_compliance","ran_at":"2026-05-19T13:49:28.699885Z","status":"completed","version":"1.0.0"}],"endpoint":"/pith/2605.05112/integrity.json","findings":[{"audited_at":"2026-05-19T13:49:28.699885Z","detected_arxiv_id":null,"detected_doi":"10.48550/arXiv.1707.06347.URLhttps://arxiv.org/abs/1707.06347","detector":"doi_compliance","finding_type":"recoverable_identifier","note":"DOI in the printed bibliography is fragmented by whitespace or line breaks. A longer candidate (10.48550/arXiv.1707.06347.URLhttps://arxiv.org/abs/1707.06347) was visible in the surrounding text but could not be confirmed against doi.org as printed.","ref_index":25,"severity":"advisory","verdict_class":"incontrovertible"},{"audited_at":"2026-05-19T13:49:28.699885Z","detected_arxiv_id":null,"detected_doi":"10.48550/arXiv.2505.16400.URLhttps://arxiv.org/abs/2505.16400","detector":"doi_compliance","finding_type":"recoverable_identifier","note":"DOI in the printed bibliography is fragmented by whitespace or line breaks. A longer candidate (10.48550/arXiv.2505.16400.URLhttps://arxiv.org/abs/2505.16400) was visible in the surrounding text but could not be confirmed against doi.org as printed.","ref_index":32,"severity":"advisory","verdict_class":"incontrovertible"}],"snapshot_sha256":"060df40dbd2eb36c3189bf00c8bb7f86f84f4260deb4c07f9488f576720145d3","summary":{"advisory":2,"by_detector":{"doi_compliance":{"advisory":2,"critical":0,"informational":0,"total":2}},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Agentic reinforcement learning (RL) for software engineering spends much of its compute on stateful trajectories whose grouped binary rewards are highly skewed and weakly contrastive. We frame this as pass-rate control and show that the binary reward-side signal is strongest near a 50% rollout pass rate under four criteria: reward entropy, group-filtering survival, leave-one-out (RLOO) advantage energy under Group Relative Policy Optimization (GRPO), and success-failure pair count. We propose Prefix Sampling (PS), which replays self-generated trajectory prefixes to steer skewed groups toward t","authors_text":"Dawei Yin, Daxiang Dong, Dou Shen, Haotian Zhao, Jianmin Wu, Jingnan Gu, Lun Tian, Tianshu Zhu, Wenyu Zhang, Xiaoying Zuo, Yucheng Zeng","cross_cats":[],"headline":"Steering rollout pass rates to 50 percent strengthens binary-reward signals in agentic RL.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-06T16:44:38Z","title":"Rollout Pass-Rate Control: Steering Binary-Reward RL Toward Its Most Informative Regime"},"references":{"count":37,"internal_anchors":7,"resolved_work":37,"sample":[{"cited_arxiv_id":"2402.03300","doi":"10.48550/arxiv.2402.03300","is_internal_anchor":true,"ref_index":1,"title":"DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models","work_id":"c5006563-f3ec-438a-9e35-b7b484f34828","year":2024},{"cited_arxiv_id":"2503.14476","doi":"10.48550/arxiv.2503.14476","is_internal_anchor":true,"ref_index":2,"title":"DAPO: An Open-Source LLM Reinforcement Learning System at Scale","work_id":"64019d00-0b11-4bbd-b173-b46c8fad0157","year":2025},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"DeepSWE: Training a fully open-sourced, state-of-the-art coding agent by scaling RL","work_id":"6fc5b99c-abe6-4a52-94f7-53dbf6454fc5","year":2025},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Let it flow: Agentic crafting on rock and roll, building the rome model within an open agentic learning ecosystem.arXiv preprint","work_id":"a1afde43-96e3-49c9-af14-25f128d65fe3","year":2025},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Reuse your FLOPs: Scaling RL on hard problems by conditioning on very off-policy prefixes.CoRR, abs/2601.18795","work_id":"2be8a44f-c445-4fc6-9eec-82c26f802afc","year":2026}],"snapshot_sha256":"39f4506ac935ea9ff60270a0cd9b36327c3a953bee76b55bb3a6083483f326b6"},"source":{"id":"2605.05112","kind":"arxiv","version":3},"verdict":{"created_at":"2026-05-19T17:28:23.118864Z","id":"9325516a-2470-4889-906a-4b7aa82083c1","model_set":{"reader":"grok-4.3"},"one_line_summary":"Prefix Sampling replays self-generated trajectory prefixes to control rollout pass rates near 50% in binary-reward RL, delivering wall-clock speedups and modest performance gains on SWE-bench Verified and AIME tasks.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Steering rollout pass rates to 50 percent strengthens binary-reward signals in agentic RL.","strongest_claim":"The binary reward-side signal is strongest near a 50% rollout pass rate under four criteria: reward entropy, group-filtering survival, leave-one-out (RLOO) advantage energy under GRPO, and success-failure pair count. Prefix Sampling steers groups toward this regime and yields 2.01x and 1.55x wall-clock speedups while matching or exceeding baseline scores.","weakest_assumption":"That replaying prefixes from prior trajectories and masking their tokens will steer pass rates to the informative regime without introducing systematic bias into the policy gradient or destabilizing GRPO optimization."}},"verdict_id":"9325516a-2470-4889-906a-4b7aa82083c1"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:5931413dcc68ec370a4fada7d9c6f7c5ada38596d415414a149b8f9889d7e7ee","target":"record","created_at":"2026-05-20T00:01:42Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"3aa61b8262eb85c21d150031d0ca3420ae03f2ea67a89971bc860db972c39f44","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-06T16:44:38Z","title_canon_sha256":"ff78394a3014356faa26a65fa7d1b9539095febbe427226953a86d4b8c1b7bf9"},"schema_version":"1.0","source":{"id":"2605.05112","kind":"arxiv","version":3}},"canonical_sha256":"64fde5e889fa46f58f9eef4d64158e4fb4e82bc2cd8165a1ca2a2fa621389270","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"64fde5e889fa46f58f9eef4d64158e4fb4e82bc2cd8165a1ca2a2fa621389270","first_computed_at":"2026-05-20T00:01:42.842444Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-20T00:01:42.842444Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"pvNDJ4Tqc3SlQBb3X1lwZ6dhXU8A1L5s5ta1SIPv6m4gdDuDNueID3goWsSX+UTyIq33xExSgfqUdXtp3ycJBw==","signature_status":"signed_v1","signed_at":"2026-05-20T00:01:42.843208Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.05112","source_kind":"arxiv","source_version":3}}},"equivocations":[{"signer_id":"pith.science","event_type":"integrity_finding","target":"integrity","event_ids":["sha256:21fc751c656e5720262086b26b09fe4f5f4548ff8e9770dcbf8a0980dfca004d","sha256:d9dba3b3523b1b19eaa0dc23de27605d95e900da21a0c364366d188bcf4fbb55"]}],"invalid_events":[],"applied_event_ids":["sha256:5931413dcc68ec370a4fada7d9c6f7c5ada38596d415414a149b8f9889d7e7ee","sha256:05c9ff4aff9d88ba2330d58a74ba3b050a1c8e43b88acb7e11634472ed17b006"],"state_sha256":"7217e83b54ab3d3e1b742fadc6213dc0d3b2c942d80dd8177f810d22d941ada3"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"Ka+jJ5x8Cwe0KRhS/dkjkR+YkKECjwXyKhGlq1L07ZPZzbMlOStl3U7Mf15j+LWkRYLZh/fMGiWEwMEHzYhqCw==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-26T04:55:14.715601Z","bundle_sha256":"06d188f77a54f8d70d7f77aacba5d4bd3c290ea1a72f24be2bee376716a9383c"}}