{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:KP27Q3ABXMLX3LNJ3TWELYJXRX","short_pith_number":"pith:KP27Q3AB","schema_version":"1.0","canonical_sha256":"53f5f86c01bb177dada9dcec45e1378de362b405ad91ff72dcacc95fad9bf5b5","source":{"kind":"arxiv","id":"2510.18821","version":3},"attestation_state":"computed","paper":{"title":"Search Self-play: Pushing the Frontier of Agent Capability without Supervision","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.LG","authors_text":"Chutian Wang, Guanjun Jiang, Haonan Chen, Haotian Xu, Hongliang Lu, Jiaqi Guo, Pengyu Cheng, Ruijin Ding, Xiaoxi Jiang, Yuhang Wen","submitted_at":"2025-10-21T17:19:35Z","abstract_excerpt":"Reinforcement learning with verifiable rewards (RLVR) has become the mainstream technique for training LLM agents. However, RLVR highly depends on well-crafted task queries and corresponding ground-truth answers to provide accurate rewards, which requires significant human effort and hinders the scaling of RL processes, especially in agentic scenarios. Although a few recent works explore task synthesis methods, the difficulty of generated agentic tasks can hardly be controlled to provide effective RL training advantages. To achieve agentic RLVR with higher scalability, we explore self-play tra"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2510.18821","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2025-10-21T17:19:35Z","cross_cats_sorted":[],"title_canon_sha256":"9282ea2f430c6b96a214184d2ef18fbc26763433544fed61bba5c3bbb00256f7","abstract_canon_sha256":"552c7109c6c1f1126f04898b536d0ec1de1b2a27a2a880ae7755fd536004197d"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T01:05:00.498346Z","signature_b64":"bobeYIHybjnDFuCbWMIJOSLmmwtVHPDxZJ1/EOrLscyD1I8LYlHYKhUg28LArnk8WAbJyq0HKTeTZyShkGnCAg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"53f5f86c01bb177dada9dcec45e1378de362b405ad91ff72dcacc95fad9bf5b5","last_reissued_at":"2026-05-20T01:05:00.497560Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T01:05:00.497560Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Search Self-play: Pushing the Frontier of Agent Capability without Supervision","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.LG","authors_text":"Chutian Wang, Guanjun Jiang, Haonan Chen, Haotian Xu, Hongliang Lu, Jiaqi Guo, Pengyu Cheng, Ruijin Ding, Xiaoxi Jiang, Yuhang Wen","submitted_at":"2025-10-21T17:19:35Z","abstract_excerpt":"Reinforcement learning with verifiable rewards (RLVR) has become the mainstream technique for training LLM agents. However, RLVR highly depends on well-crafted task queries and corresponding ground-truth answers to provide accurate rewards, which requires significant human effort and hinders the scaling of RL processes, especially in agentic scenarios. Although a few recent works explore task synthesis methods, the difficulty of generated agentic tasks can hardly be controlled to provide effective RL training advantages. To achieve agentic RLVR with higher scalability, we explore self-play tra"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2510.18821","kind":"arxiv","version":3},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2510.18821/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2510.18821","created_at":"2026-05-20T01:05:00.497673+00:00"},{"alias_kind":"arxiv_version","alias_value":"2510.18821v3","created_at":"2026-05-20T01:05:00.497673+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2510.18821","created_at":"2026-05-20T01:05:00.497673+00:00"},{"alias_kind":"pith_short_12","alias_value":"KP27Q3ABXMLX","created_at":"2026-05-20T01:05:00.497673+00:00"},{"alias_kind":"pith_short_16","alias_value":"KP27Q3ABXMLX3LNJ","created_at":"2026-05-20T01:05:00.497673+00:00"},{"alias_kind":"pith_short_8","alias_value":"KP27Q3AB","created_at":"2026-05-20T01:05:00.497673+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":3,"internal_anchor_count":3,"sample":[{"citing_arxiv_id":"2605.17721","citing_title":"EXG: Self-Evolving Agents with Experience Graphs","ref_index":15,"is_internal_anchor":true},{"citing_arxiv_id":"2604.14054","citing_title":"$\\pi$-Play: Multi-Agent Self-Play via Privileged Self-Distillation without External Data","ref_index":20,"is_internal_anchor":true},{"citing_arxiv_id":"2604.20148","citing_title":"Meta-Tool: Efficient Few-Shot Tool Adaptation for Small Language Models","ref_index":41,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/KP27Q3ABXMLX3LNJ3TWELYJXRX","json":"https://pith.science/pith/KP27Q3ABXMLX3LNJ3TWELYJXRX.json","graph_json":"https://pith.science/api/pith-number/KP27Q3ABXMLX3LNJ3TWELYJXRX/graph.json","events_json":"https://pith.science/api/pith-number/KP27Q3ABXMLX3LNJ3TWELYJXRX/events.json","paper":"https://pith.science/paper/KP27Q3AB"},"agent_actions":{"view_html":"https://pith.science/pith/KP27Q3ABXMLX3LNJ3TWELYJXRX","download_json":"https://pith.science/pith/KP27Q3ABXMLX3LNJ3TWELYJXRX.json","view_paper":"https://pith.science/paper/KP27Q3AB","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2510.18821&json=true","fetch_graph":"https://pith.science/api/pith-number/KP27Q3ABXMLX3LNJ3TWELYJXRX/graph.json","fetch_events":"https://pith.science/api/pith-number/KP27Q3ABXMLX3LNJ3TWELYJXRX/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/KP27Q3ABXMLX3LNJ3TWELYJXRX/action/timestamp_anchor","attest_storage":"https://pith.science/pith/KP27Q3ABXMLX3LNJ3TWELYJXRX/action/storage_attestation","attest_author":"https://pith.science/pith/KP27Q3ABXMLX3LNJ3TWELYJXRX/action/author_attestation","sign_citation":"https://pith.science/pith/KP27Q3ABXMLX3LNJ3TWELYJXRX/action/citation_signature","submit_replication":"https://pith.science/pith/KP27Q3ABXMLX3LNJ3TWELYJXRX/action/replication_record"}},"created_at":"2026-05-20T01:05:00.497673+00:00","updated_at":"2026-05-20T01:05:00.497673+00:00"}