{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2025:R3RNULAH2ZVUS6HRXSXFA6DB3J","short_pith_number":"pith:R3RNULAH","canonical_record":{"source":{"id":"2510.06672","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2025-10-08T05:53:56Z","cross_cats_sorted":[],"title_canon_sha256":"aeb3a1f7c27971c892cbf037a77f4e4dd6d2492567454a4a5247814f9a2ee246","abstract_canon_sha256":"4a3ce8dd7bafb378c22e4a4c3dc72707405b15ba991f244f7d43ef7631ecb6ae"},"schema_version":"1.0"},"canonical_sha256":"8ee2da2c07d66b4978f1bcae507861da7987fa4c3d8ed1049fc335be02e65869","source":{"kind":"arxiv","id":"2510.06672","version":3},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2510.06672","created_at":"2026-05-26T02:03:57Z"},{"alias_kind":"arxiv_version","alias_value":"2510.06672v3","created_at":"2026-05-26T02:03:57Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2510.06672","created_at":"2026-05-26T02:03:57Z"},{"alias_kind":"pith_short_12","alias_value":"R3RNULAH2ZVU","created_at":"2026-05-26T02:03:57Z"},{"alias_kind":"pith_short_16","alias_value":"R3RNULAH2ZVUS6HR","created_at":"2026-05-26T02:03:57Z"},{"alias_kind":"pith_short_8","alias_value":"R3RNULAH","created_at":"2026-05-26T02:03:57Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2025:R3RNULAH2ZVUS6HRXSXFA6DB3J","target":"record","payload":{"canonical_record":{"source":{"id":"2510.06672","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2025-10-08T05:53:56Z","cross_cats_sorted":[],"title_canon_sha256":"aeb3a1f7c27971c892cbf037a77f4e4dd6d2492567454a4a5247814f9a2ee246","abstract_canon_sha256":"4a3ce8dd7bafb378c22e4a4c3dc72707405b15ba991f244f7d43ef7631ecb6ae"},"schema_version":"1.0"},"canonical_sha256":"8ee2da2c07d66b4978f1bcae507861da7987fa4c3d8ed1049fc335be02e65869","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-26T02:03:57.262790Z","signature_b64":"nMXp3ZcMnR1f/PqgWUYnNbuaygVOIjVQHQ0lvaU30pId70sSWQ+3bhzSdzjBuGwOUcv9BLrekglH5PWXDZv9Dg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"8ee2da2c07d66b4978f1bcae507861da7987fa4c3d8ed1049fc335be02e65869","last_reissued_at":"2026-05-26T02:03:57.261916Z","signature_status":"signed_v1","first_computed_at":"2026-05-26T02:03:57.261916Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2510.06672","source_version":3,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-26T02:03:57Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"zOVKD2gf7jSqvbiGgT0k8B5YmY03Y3HaCCZhzDUSPK2qiOk+zrSKpKDEeuLHIWCb28hdPCbTn23rfEQv+YmODQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-27T19:55:46.193132Z"},"content_sha256":"22dab65e347ebdb10f4db4f76308a2fcbe2b95e057b6551a337b225e18b38983","schema_version":"1.0","event_id":"sha256:22dab65e347ebdb10f4db4f76308a2fcbe2b95e057b6551a337b225e18b38983"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2025:R3RNULAH2ZVUS6HRXSXFA6DB3J","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"XRPO: Pushing the limits of GRPO with Targeted Exploration and Exploitation","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.LG","authors_text":"Fan Lai, Haizhong Zheng, Minghao Fang, Udbhav Bamba, Yifan Yu","submitted_at":"2025-10-08T05:53:56Z","abstract_excerpt":"Reinforcement learning algorithms such as GRPO have driven recent advances in large language model (LLM) reasoning. While scaling the number of rollouts stabilizes training, existing approaches suffer from limited exploration on challenging prompts and leave informative feedback signals underexploited, due to context-independent rollout allocation across prompts (e.g., generating 16 rollouts per prompt) and relying heavily on sparse rewards. This paper presents XRPO(eXplore - eXploit GRPO), a unified framework that recasts policy optimization through the principled lens of rollout exploration-"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2510.06672","kind":"arxiv","version":3},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2510.06672/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-26T02:03:57Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"fj+OuvzVpERIXVaj8M+ynXeAUpP3q0HGRRajrhdR444QxsKQc1SsrIVljvp8fRgLYq0N/BbqXFudRY4+R0lHDQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-27T19:55:46.193617Z"},"content_sha256":"6b0b6b8a13b890236612293d352044426c157be5c832e7280524c13177284672","schema_version":"1.0","event_id":"sha256:6b0b6b8a13b890236612293d352044426c157be5c832e7280524c13177284672"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/R3RNULAH2ZVUS6HRXSXFA6DB3J/bundle.json","state_url":"https://pith.science/pith/R3RNULAH2ZVUS6HRXSXFA6DB3J/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/R3RNULAH2ZVUS6HRXSXFA6DB3J/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-27T19:55:46Z","links":{"resolver":"https://pith.science/pith/R3RNULAH2ZVUS6HRXSXFA6DB3J","bundle":"https://pith.science/pith/R3RNULAH2ZVUS6HRXSXFA6DB3J/bundle.json","state":"https://pith.science/pith/R3RNULAH2ZVUS6HRXSXFA6DB3J/state.json","well_known_bundle":"https://pith.science/.well-known/pith/R3RNULAH2ZVUS6HRXSXFA6DB3J/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2025:R3RNULAH2ZVUS6HRXSXFA6DB3J","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"4a3ce8dd7bafb378c22e4a4c3dc72707405b15ba991f244f7d43ef7631ecb6ae","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2025-10-08T05:53:56Z","title_canon_sha256":"aeb3a1f7c27971c892cbf037a77f4e4dd6d2492567454a4a5247814f9a2ee246"},"schema_version":"1.0","source":{"id":"2510.06672","kind":"arxiv","version":3}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2510.06672","created_at":"2026-05-26T02:03:57Z"},{"alias_kind":"arxiv_version","alias_value":"2510.06672v3","created_at":"2026-05-26T02:03:57Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2510.06672","created_at":"2026-05-26T02:03:57Z"},{"alias_kind":"pith_short_12","alias_value":"R3RNULAH2ZVU","created_at":"2026-05-26T02:03:57Z"},{"alias_kind":"pith_short_16","alias_value":"R3RNULAH2ZVUS6HR","created_at":"2026-05-26T02:03:57Z"},{"alias_kind":"pith_short_8","alias_value":"R3RNULAH","created_at":"2026-05-26T02:03:57Z"}],"graph_snapshots":[{"event_id":"sha256:6b0b6b8a13b890236612293d352044426c157be5c832e7280524c13177284672","target":"graph","created_at":"2026-05-26T02:03:57Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2510.06672/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Reinforcement learning algorithms such as GRPO have driven recent advances in large language model (LLM) reasoning. While scaling the number of rollouts stabilizes training, existing approaches suffer from limited exploration on challenging prompts and leave informative feedback signals underexploited, due to context-independent rollout allocation across prompts (e.g., generating 16 rollouts per prompt) and relying heavily on sparse rewards. This paper presents XRPO(eXplore - eXploit GRPO), a unified framework that recasts policy optimization through the principled lens of rollout exploration-","authors_text":"Fan Lai, Haizhong Zheng, Minghao Fang, Udbhav Bamba, Yifan Yu","cross_cats":[],"headline":"","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2025-10-08T05:53:56Z","title":"XRPO: Pushing the limits of GRPO with Targeted Exploration and Exploitation"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2510.06672","kind":"arxiv","version":3},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:22dab65e347ebdb10f4db4f76308a2fcbe2b95e057b6551a337b225e18b38983","target":"record","created_at":"2026-05-26T02:03:57Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"4a3ce8dd7bafb378c22e4a4c3dc72707405b15ba991f244f7d43ef7631ecb6ae","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2025-10-08T05:53:56Z","title_canon_sha256":"aeb3a1f7c27971c892cbf037a77f4e4dd6d2492567454a4a5247814f9a2ee246"},"schema_version":"1.0","source":{"id":"2510.06672","kind":"arxiv","version":3}},"canonical_sha256":"8ee2da2c07d66b4978f1bcae507861da7987fa4c3d8ed1049fc335be02e65869","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"8ee2da2c07d66b4978f1bcae507861da7987fa4c3d8ed1049fc335be02e65869","first_computed_at":"2026-05-26T02:03:57.261916Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-26T02:03:57.261916Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"nMXp3ZcMnR1f/PqgWUYnNbuaygVOIjVQHQ0lvaU30pId70sSWQ+3bhzSdzjBuGwOUcv9BLrekglH5PWXDZv9Dg==","signature_status":"signed_v1","signed_at":"2026-05-26T02:03:57.262790Z","signed_message":"canonical_sha256_bytes"},"source_id":"2510.06672","source_kind":"arxiv","source_version":3}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:22dab65e347ebdb10f4db4f76308a2fcbe2b95e057b6551a337b225e18b38983","sha256:6b0b6b8a13b890236612293d352044426c157be5c832e7280524c13177284672"],"state_sha256":"6b6a4837209b9765a9c94a1f97436dfffd862966923f7e9ce11da9fa7395d41c"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"bI0L8ebGCVLkvim+hIuDJbMDdGs6Sp5M3BfFOdfDGN2b5+PwgvhJDe6dYto6jzBSDLX60rN1eGZCsISUGUDBDA==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-27T19:55:46.196656Z","bundle_sha256":"766395f4eaeb389bb4d25d717be2695373eb779cbd57b56f7bc6567bcb395533"}}