{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:RYVQEAUDGW65ZGD55QAOJNHSIW","short_pith_number":"pith:RYVQEAUD","schema_version":"1.0","canonical_sha256":"8e2b02028335bddc987dec00e4b4f2459a73c968a690cb2f56fc3280e364f4d7","source":{"kind":"arxiv","id":"2507.19849","version":1},"attestation_state":"computed","paper":{"title":"Agentic Reinforced Policy Optimization","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"ARPO improves LLM agent performance on long-horizon tasks by sampling more at high-entropy steps right after each tool call.","cross_cats":["cs.AI","cs.CL"],"primary_cat":"cs.LG","authors_text":"Fuzheng Zhang, Guanting Dong, Guorui Zhou, Hangyu Mao, Huiyang Wang, Jiazhen Du, Ji-Rong Wen, Kai Ma, Licheng Bao, Yifei Chen, Yutao Zhu, Zhicheng Dou, Zhongxia Chen, Zhongyuan Wang","submitted_at":"2025-07-26T07:53:11Z","abstract_excerpt":"Large-scale reinforcement learning with verifiable rewards (RLVR) has demonstrated its effectiveness in harnessing the potential of large language models (LLMs) for single-turn reasoning tasks. In realistic reasoning scenarios, LLMs can often utilize external tools to assist in task-solving processes. However, current RL algorithms inadequately balance the models' intrinsic long-horizon reasoning capabilities and their proficiency in multi-turn tool interactions. To bridge this gap, we propose Agentic Reinforced Policy Optimization (ARPO), a novel agentic RL algorithm tailored for training mul"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":true,"formal_links_present":true},"canonical_record":{"source":{"id":"2507.19849","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2025-07-26T07:53:11Z","cross_cats_sorted":["cs.AI","cs.CL"],"title_canon_sha256":"c6efe2ebcc3ed7ebb55512d4066b4de04d544066275cf02ab776cb1a95f4a0df","abstract_canon_sha256":"2d063dcb52d9088260070a91f280b9064b4539cd1d082dfcb0de4de283df80a3"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:15.333885Z","signature_b64":"ysfb8jNGBhZqy1LdoRFEuTVGNj6ZHhBS70b9WsLDgLQLETf5x5+wOubjiUmIZQaWtEdfbdYVdXiwhsqyC8PyAQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"8e2b02028335bddc987dec00e4b4f2459a73c968a690cb2f56fc3280e364f4d7","last_reissued_at":"2026-05-17T23:38:15.333245Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:15.333245Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Agentic Reinforced Policy Optimization","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"ARPO improves LLM agent performance on long-horizon tasks by sampling more at high-entropy steps right after each tool call.","cross_cats":["cs.AI","cs.CL"],"primary_cat":"cs.LG","authors_text":"Fuzheng Zhang, Guanting Dong, Guorui Zhou, Hangyu Mao, Huiyang Wang, Jiazhen Du, Ji-Rong Wen, Kai Ma, Licheng Bao, Yifei Chen, Yutao Zhu, Zhicheng Dou, Zhongxia Chen, Zhongyuan Wang","submitted_at":"2025-07-26T07:53:11Z","abstract_excerpt":"Large-scale reinforcement learning with verifiable rewards (RLVR) has demonstrated its effectiveness in harnessing the potential of large language models (LLMs) for single-turn reasoning tasks. In realistic reasoning scenarios, LLMs can often utilize external tools to assist in task-solving processes. However, current RL algorithms inadequately balance the models' intrinsic long-horizon reasoning capabilities and their proficiency in multi-turn tool interactions. To bridge this gap, we propose Agentic Reinforced Policy Optimization (ARPO), a novel agentic RL algorithm tailored for training mul"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"ARPO achieves improved performance using only half of the tool-use budget required by existing methods, offering a scalable solution for aligning LLM-based agents with real-time dynamic environments.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The preliminary observation that LLMs exhibit highly uncertain behavior (increased entropy) immediately following tool interactions is general enough to guide adaptive sampling across tasks and that this mechanism reliably improves long-horizon performance.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"ARPO adds entropy-based adaptive rollouts and stepwise advantage attribution to RL for LLM agents, outperforming prior trajectory-level methods on 13 benchmarks with half the tool budget.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"ARPO improves LLM agent performance on long-horizon tasks by sampling more at high-entropy steps right after each tool call.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"7ac708c2735623518227c9c9fea38d27afe5c25896ce4e08a003e3e3bca89518"},"source":{"id":"2507.19849","kind":"arxiv","version":1},"verdict":{"id":"a88afeff-b07d-47e7-826b-849b9284cf41","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-17T02:53:04.888884Z","strongest_claim":"ARPO achieves improved performance using only half of the tool-use budget required by existing methods, offering a scalable solution for aligning LLM-based agents with real-time dynamic environments.","one_line_summary":"ARPO adds entropy-based adaptive rollouts and stepwise advantage attribution to RL for LLM agents, outperforming prior trajectory-level methods on 13 benchmarks with half the tool budget.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The preliminary observation that LLMs exhibit highly uncertain behavior (increased entropy) immediately following tool interactions is general enough to guide adaptive sampling across tasks and that this mechanism reliably improves long-horizon performance.","pith_extraction_headline":"ARPO improves LLM agent performance on long-horizon tasks by sampling more at high-entropy steps right after each tool call."},"references":{"count":11,"sample":[{"doi":"10.18653/v1/2020.coling-main.580","year":2020,"title":"REINFORCE++: Stabilizing Critic-Free Policy Optimization with Global Advantage Normalization","work_id":"557f9e99-cb00-4dd2-92fd-67ddcddbb35d","ref_index":1,"cited_arxiv_id":"2501.03262","is_internal_anchor":true},{"doi":"10.18653/v1/2023.findings-emnlp.378","year":2023,"title":"Prabha, D., Aswini, J., Maheswari, B., Subramanian, R","work_id":"d690fac4-0cde-42d6-958a-77a77c0e7bd0","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"10.48550/arxiv","year":2025,"title":"Scaling Relationship on Learning Mathematical Reasoning with Large Language Models","work_id":"15eea2e2-dff3-42a7-842a-b663d50f64cb","ref_index":5,"cited_arxiv_id":"2308.01825","is_internal_anchor":true},{"doi":"","year":2024,"title":"thinking while doing","work_id":"a538c3a8-43ab-4c98-9bce-23c1952d7dc1","ref_index":6,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Each interaction response length is capped at 4096 tokens","work_id":"7f546fee-7358-4734-a82f-45417904c092","ref_index":7,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":11,"snapshot_sha256":"9e5ed862b21c0d38e4ad20f60781ea547fee361314ab61e7fd1611873909a002","internal_anchors":2},"formal_canon":{"evidence_count":2,"snapshot_sha256":"57118c3a0ee8588e8a444efaf89d75ec04ca7b43baec44d67609ee1f2b70d331"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2507.19849","created_at":"2026-05-17T23:38:15.333379+00:00"},{"alias_kind":"arxiv_version","alias_value":"2507.19849v1","created_at":"2026-05-17T23:38:15.333379+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2507.19849","created_at":"2026-05-17T23:38:15.333379+00:00"},{"alias_kind":"pith_short_12","alias_value":"RYVQEAUDGW65","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_16","alias_value":"RYVQEAUDGW65ZGD5","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_8","alias_value":"RYVQEAUD","created_at":"2026-05-18T12:33:37.589309+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":26,"internal_anchor_count":26,"sample":[{"citing_arxiv_id":"2603.28767","citing_title":"Gen-Searcher: Reinforcing Agentic Search for Image Generation","ref_index":24,"is_internal_anchor":true},{"citing_arxiv_id":"2605.21801","citing_title":"Why Semantic Entropy Fails: Geometry-Aware and Calibrated Uncertainty for Policy Optimization","ref_index":12,"is_internal_anchor":true},{"citing_arxiv_id":"2605.18768","citing_title":"ClinQueryAgent: A Conversational Agent for Population Health Management","ref_index":110,"is_internal_anchor":true},{"citing_arxiv_id":"2605.14133","citing_title":"ClawForge: Generating Executable Interactive Benchmarks for Command-Line Agents","ref_index":38,"is_internal_anchor":true},{"citing_arxiv_id":"2605.15706","citing_title":"Differentiable Mixture-of-Agents Incentivizes Swarm Intelligence of Large Language Models","ref_index":97,"is_internal_anchor":true},{"citing_arxiv_id":"2605.17295","citing_title":"DISA: Offline Importance Sampling for Distribution-Matching LLM-RL","ref_index":16,"is_internal_anchor":true},{"citing_arxiv_id":"2509.08827","citing_title":"A Survey of Reinforcement Learning for Large Reasoning Models","ref_index":114,"is_internal_anchor":true},{"citing_arxiv_id":"2512.03043","citing_title":"OneThinker: All-in-one Reasoning Model for Image and Video","ref_index":26,"is_internal_anchor":true},{"citing_arxiv_id":"2601.08605","citing_title":"ExpSeek: Self-Triggered Experience Seeking for Web Agents","ref_index":1,"is_internal_anchor":true},{"citing_arxiv_id":"2605.11853","citing_title":"GEAR: Granularity-Adaptive Advantage Reweighting for LLM Agents via Self-Distillation","ref_index":38,"is_internal_anchor":true},{"citing_arxiv_id":"2605.14133","citing_title":"ClawForge: Generating Executable Interactive Benchmarks for Command-Line Agents","ref_index":38,"is_internal_anchor":true},{"citing_arxiv_id":"2603.28767","citing_title":"Gen-Searcher: Reinforcing Agentic Search for Image Generation","ref_index":24,"is_internal_anchor":true},{"citing_arxiv_id":"2605.11853","citing_title":"GEAR: Granularity-Adaptive Advantage Reweighting for LLM Agents via Self-Distillation","ref_index":38,"is_internal_anchor":true},{"citing_arxiv_id":"2605.12070","citing_title":"Missing Old Logits in Asynchronous Agentic RL: Semantic Mismatch and Repair Methods for Off-Policy Correction","ref_index":5,"is_internal_anchor":true},{"citing_arxiv_id":"2605.12004","citing_title":"Learning Agentic Policy from Action Guidance","ref_index":14,"is_internal_anchor":true},{"citing_arxiv_id":"2605.06326","citing_title":"Teaching Thinking Models to Reason with Tools: A Full-Pipeline Recipe for Tool-Integrated Reasoning","ref_index":8,"is_internal_anchor":true},{"citing_arxiv_id":"2605.06200","citing_title":"A$^2$TGPO: Agentic Turn-Group Policy Optimization with Adaptive Turn-level Clipping","ref_index":9,"is_internal_anchor":true},{"citing_arxiv_id":"2605.00365","citing_title":"Uniform-Correct Policy Optimization: Breaking RLVR's Indifference to Diversity","ref_index":6,"is_internal_anchor":true},{"citing_arxiv_id":"2604.17884","citing_title":"SPREG: Structured Plan Repair with Entropy-Guided Test-Time Intervention for Large Language Model Reasoning","ref_index":8,"is_internal_anchor":true},{"citing_arxiv_id":"2604.08232","citing_title":"HiRO-Nav: Hybrid ReasOning Enables Efficient Embodied Navigation","ref_index":10,"is_internal_anchor":true},{"citing_arxiv_id":"2604.08539","citing_title":"OpenVLThinkerV2: A Generalist Multimodal Reasoning Model for Multi-domain Visual Tasks","ref_index":9,"is_internal_anchor":true},{"citing_arxiv_id":"2604.06804","citing_title":"LASER: A Data-Centric Method for Low-Cost and Efficient SQL Rewriting based on SQL-GRPO","ref_index":14,"is_internal_anchor":true},{"citing_arxiv_id":"2604.09455","citing_title":"E3-TIR: Enhanced Experience Exploitation for Tool-Integrated Reasoning","ref_index":10,"is_internal_anchor":true},{"citing_arxiv_id":"2604.13602","citing_title":"Reward Hacking in the Era of Large Models: Mechanisms, Emergent Misalignment, Challenges","ref_index":208,"is_internal_anchor":true},{"citing_arxiv_id":"2604.14518","citing_title":"Mind DeepResearch Technical Report","ref_index":6,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":2,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/RYVQEAUDGW65ZGD55QAOJNHSIW","json":"https://pith.science/pith/RYVQEAUDGW65ZGD55QAOJNHSIW.json","graph_json":"https://pith.science/api/pith-number/RYVQEAUDGW65ZGD55QAOJNHSIW/graph.json","events_json":"https://pith.science/api/pith-number/RYVQEAUDGW65ZGD55QAOJNHSIW/events.json","paper":"https://pith.science/paper/RYVQEAUD"},"agent_actions":{"view_html":"https://pith.science/pith/RYVQEAUDGW65ZGD55QAOJNHSIW","download_json":"https://pith.science/pith/RYVQEAUDGW65ZGD55QAOJNHSIW.json","view_paper":"https://pith.science/paper/RYVQEAUD","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2507.19849&json=true","fetch_graph":"https://pith.science/api/pith-number/RYVQEAUDGW65ZGD55QAOJNHSIW/graph.json","fetch_events":"https://pith.science/api/pith-number/RYVQEAUDGW65ZGD55QAOJNHSIW/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/RYVQEAUDGW65ZGD55QAOJNHSIW/action/timestamp_anchor","attest_storage":"https://pith.science/pith/RYVQEAUDGW65ZGD55QAOJNHSIW/action/storage_attestation","attest_author":"https://pith.science/pith/RYVQEAUDGW65ZGD55QAOJNHSIW/action/author_attestation","sign_citation":"https://pith.science/pith/RYVQEAUDGW65ZGD55QAOJNHSIW/action/citation_signature","submit_replication":"https://pith.science/pith/RYVQEAUDGW65ZGD55QAOJNHSIW/action/replication_record"}},"created_at":"2026-05-17T23:38:15.333379+00:00","updated_at":"2026-05-17T23:38:15.333379+00:00"}