{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:R3RNULAH2ZVUS6HRXSXFA6DB3J","short_pith_number":"pith:R3RNULAH","schema_version":"1.0","canonical_sha256":"8ee2da2c07d66b4978f1bcae507861da7987fa4c3d8ed1049fc335be02e65869","source":{"kind":"arxiv","id":"2510.06672","version":3},"attestation_state":"computed","paper":{"title":"XRPO: Pushing the limits of GRPO with Targeted Exploration and Exploitation","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.LG","authors_text":"Fan Lai, Haizhong Zheng, Minghao Fang, Udbhav Bamba, Yifan Yu","submitted_at":"2025-10-08T05:53:56Z","abstract_excerpt":"Reinforcement learning algorithms such as GRPO have driven recent advances in large language model (LLM) reasoning. While scaling the number of rollouts stabilizes training, existing approaches suffer from limited exploration on challenging prompts and leave informative feedback signals underexploited, due to context-independent rollout allocation across prompts (e.g., generating 16 rollouts per prompt) and relying heavily on sparse rewards. This paper presents XRPO(eXplore - eXploit GRPO), a unified framework that recasts policy optimization through the principled lens of rollout exploration-"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2510.06672","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2025-10-08T05:53:56Z","cross_cats_sorted":[],"title_canon_sha256":"aeb3a1f7c27971c892cbf037a77f4e4dd6d2492567454a4a5247814f9a2ee246","abstract_canon_sha256":"4a3ce8dd7bafb378c22e4a4c3dc72707405b15ba991f244f7d43ef7631ecb6ae"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-26T02:03:57.262790Z","signature_b64":"nMXp3ZcMnR1f/PqgWUYnNbuaygVOIjVQHQ0lvaU30pId70sSWQ+3bhzSdzjBuGwOUcv9BLrekglH5PWXDZv9Dg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"8ee2da2c07d66b4978f1bcae507861da7987fa4c3d8ed1049fc335be02e65869","last_reissued_at":"2026-05-26T02:03:57.261916Z","signature_status":"signed_v1","first_computed_at":"2026-05-26T02:03:57.261916Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"XRPO: Pushing the limits of GRPO with Targeted Exploration and Exploitation","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.LG","authors_text":"Fan Lai, Haizhong Zheng, Minghao Fang, Udbhav Bamba, Yifan Yu","submitted_at":"2025-10-08T05:53:56Z","abstract_excerpt":"Reinforcement learning algorithms such as GRPO have driven recent advances in large language model (LLM) reasoning. While scaling the number of rollouts stabilizes training, existing approaches suffer from limited exploration on challenging prompts and leave informative feedback signals underexploited, due to context-independent rollout allocation across prompts (e.g., generating 16 rollouts per prompt) and relying heavily on sparse rewards. This paper presents XRPO(eXplore - eXploit GRPO), a unified framework that recasts policy optimization through the principled lens of rollout exploration-"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2510.06672","kind":"arxiv","version":3},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2510.06672/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2510.06672","created_at":"2026-05-26T02:03:57.262046+00:00"},{"alias_kind":"arxiv_version","alias_value":"2510.06672v3","created_at":"2026-05-26T02:03:57.262046+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2510.06672","created_at":"2026-05-26T02:03:57.262046+00:00"},{"alias_kind":"pith_short_12","alias_value":"R3RNULAH2ZVU","created_at":"2026-05-26T02:03:57.262046+00:00"},{"alias_kind":"pith_short_16","alias_value":"R3RNULAH2ZVUS6HR","created_at":"2026-05-26T02:03:57.262046+00:00"},{"alias_kind":"pith_short_8","alias_value":"R3RNULAH","created_at":"2026-05-26T02:03:57.262046+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":6,"internal_anchor_count":6,"sample":[{"citing_arxiv_id":"2602.06239","citing_title":"Provably avoiding over-optimization in Direct Preference Optimization without knowing the data distribution","ref_index":2,"is_internal_anchor":true},{"citing_arxiv_id":"2602.06239","citing_title":"Provably avoiding over-optimization in Direct Preference Optimization without knowing the data distribution","ref_index":2,"is_internal_anchor":true},{"citing_arxiv_id":"2603.15646","citing_title":"Alternating Reinforcement Learning with Contextual Rubric Rewards: Beyond the Scalarization Strategy","ref_index":4,"is_internal_anchor":true},{"citing_arxiv_id":"2605.05262","citing_title":"Maximizing Rollout Informativeness under a Fixed Budget: A Submodular View of Tree Search for Tool-Use Agentic Reinforcement Learning","ref_index":3,"is_internal_anchor":true},{"citing_arxiv_id":"2604.06804","citing_title":"LASER: A Data-Centric Method for Low-Cost and Efficient SQL Rewriting based on SQL-GRPO","ref_index":6,"is_internal_anchor":true},{"citing_arxiv_id":"2605.02913","citing_title":"Generate, Filter, Control, Replay: A Comprehensive Survey of Rollout Strategies for LLM Reinforcement Learning","ref_index":3,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/R3RNULAH2ZVUS6HRXSXFA6DB3J","json":"https://pith.science/pith/R3RNULAH2ZVUS6HRXSXFA6DB3J.json","graph_json":"https://pith.science/api/pith-number/R3RNULAH2ZVUS6HRXSXFA6DB3J/graph.json","events_json":"https://pith.science/api/pith-number/R3RNULAH2ZVUS6HRXSXFA6DB3J/events.json","paper":"https://pith.science/paper/R3RNULAH"},"agent_actions":{"view_html":"https://pith.science/pith/R3RNULAH2ZVUS6HRXSXFA6DB3J","download_json":"https://pith.science/pith/R3RNULAH2ZVUS6HRXSXFA6DB3J.json","view_paper":"https://pith.science/paper/R3RNULAH","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2510.06672&json=true","fetch_graph":"https://pith.science/api/pith-number/R3RNULAH2ZVUS6HRXSXFA6DB3J/graph.json","fetch_events":"https://pith.science/api/pith-number/R3RNULAH2ZVUS6HRXSXFA6DB3J/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/R3RNULAH2ZVUS6HRXSXFA6DB3J/action/timestamp_anchor","attest_storage":"https://pith.science/pith/R3RNULAH2ZVUS6HRXSXFA6DB3J/action/storage_attestation","attest_author":"https://pith.science/pith/R3RNULAH2ZVUS6HRXSXFA6DB3J/action/author_attestation","sign_citation":"https://pith.science/pith/R3RNULAH2ZVUS6HRXSXFA6DB3J/action/citation_signature","submit_replication":"https://pith.science/pith/R3RNULAH2ZVUS6HRXSXFA6DB3J/action/replication_record"}},"created_at":"2026-05-26T02:03:57.262046+00:00","updated_at":"2026-05-26T02:03:57.262046+00:00"}