{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2025:EYMHDPD5YM3H6XWTQ4LIE3APIW","short_pith_number":"pith:EYMHDPD5","canonical_record":{"source":{"id":"2504.13958","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2025-04-16T21:45:32Z","cross_cats_sorted":["cs.AI","cs.CL"],"title_canon_sha256":"a6a3d8cbe619dc8a2acd102e7ff2545163a89ac48f1be9b07f337af429f6db69","abstract_canon_sha256":"554a6c4040adfff956401c0dcf839c06f1adf4c031130927d473224b6450fda5"},"schema_version":"1.0"},"canonical_sha256":"261871bc7dc3367f5ed38716826c0f459c73573a005c87b75c51d4dcf1edc70c","source":{"kind":"arxiv","id":"2504.13958","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2504.13958","created_at":"2026-05-18T03:22:05Z"},{"alias_kind":"arxiv_version","alias_value":"2504.13958v1","created_at":"2026-05-18T03:22:05Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2504.13958","created_at":"2026-05-18T03:22:05Z"},{"alias_kind":"pith_short_12","alias_value":"EYMHDPD5YM3H","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"EYMHDPD5YM3H6XWT","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"EYMHDPD5","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2025:EYMHDPD5YM3H6XWTQ4LIE3APIW","target":"record","payload":{"canonical_record":{"source":{"id":"2504.13958","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2025-04-16T21:45:32Z","cross_cats_sorted":["cs.AI","cs.CL"],"title_canon_sha256":"a6a3d8cbe619dc8a2acd102e7ff2545163a89ac48f1be9b07f337af429f6db69","abstract_canon_sha256":"554a6c4040adfff956401c0dcf839c06f1adf4c031130927d473224b6450fda5"},"schema_version":"1.0"},"canonical_sha256":"261871bc7dc3367f5ed38716826c0f459c73573a005c87b75c51d4dcf1edc70c","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T03:22:05.943746Z","signature_b64":"SmZpG9mh8k5OaQrbinTWKTXOzXXfuGXTvHomzUkDu8kOldur5NF+0uJqYxOkPI/invvMXGBTVzB0vHsdRDx7Cw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"261871bc7dc3367f5ed38716826c0f459c73573a005c87b75c51d4dcf1edc70c","last_reissued_at":"2026-05-18T03:22:05.942883Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T03:22:05.942883Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2504.13958","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T03:22:05Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"Qhr5Bp3hDyNveF7KxvLWeILtTsjWH4K8Y7rnxKUZSxaC80fbgw/R9wP/elUx5Oumu+5niroSvukPVkYEpuBjBA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-20T09:27:35.865357Z"},"content_sha256":"5737557dda424bbb088e6f621567d7d050868be832ab14bca02a5062b48bb3cd","schema_version":"1.0","event_id":"sha256:5737557dda424bbb088e6f621567d7d050868be832ab14bca02a5062b48bb3cd"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2025:EYMHDPD5YM3H6XWTQ4LIE3APIW","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"ToolRL: Reward is All Tool Learning Needs","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"A principled reward design for tool-use tasks lets reinforcement learning outperform supervised fine-tuning in training LLMs to use tools.","cross_cats":["cs.AI","cs.CL"],"primary_cat":"cs.LG","authors_text":"Cheng Qian, Dilek Hakkani-T\\\"ur, Emre Can Acikgoz, Gokhan Tur, Heng Ji, Hongru Wang, Qi He, Xiusi Chen","submitted_at":"2025-04-16T21:45:32Z","abstract_excerpt":"Current Large Language Models (LLMs) often undergo supervised fine-tuning (SFT) to acquire tool use capabilities. However, SFT struggles to generalize to unfamiliar or complex tool use scenarios. Recent advancements in reinforcement learning (RL), particularly with R1-like models, have demonstrated promising reasoning and generalization abilities. Yet, reward design for tool use presents unique challenges: multiple tools may be invoked with diverse parameters, and coarse-grained reward signals, such as answer matching, fail to offer the finegrained feedback required for effective learning. In "},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Empirical evaluations across diverse benchmarks demonstrate that our approach yields robust, scalable, and stable training, achieving a 17% improvement over base models and a 15% gain over SFT models.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The explored reward strategies and the proposed principled design are assumed to transfer to tool-use scenarios outside the specific benchmarks and tool sets used in the experiments.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"A principled reward design for tool selection and application in RL-trained LLMs delivers 17% gains over base models and 15% over SFT across benchmarks.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"A principled reward design for tool-use tasks lets reinforcement learning outperform supervised fine-tuning in training LLMs to use tools.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"e7806655713c5806b083448c7e35d7fcabdbc7ab0734f85664d0c75665d8e2ee"},"source":{"id":"2504.13958","kind":"arxiv","version":1},"verdict":{"id":"e2d1cb57-4334-4172-9e1b-d0df4bfd74db","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-14T00:21:51.946869Z","strongest_claim":"Empirical evaluations across diverse benchmarks demonstrate that our approach yields robust, scalable, and stable training, achieving a 17% improvement over base models and a 15% gain over SFT models.","one_line_summary":"A principled reward design for tool selection and application in RL-trained LLMs delivers 17% gains over base models and 15% over SFT across benchmarks.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The explored reward strategies and the proposed principled design are assumed to transfer to tool-use scenarios outside the specific benchmarks and tool sets used in the experiments.","pith_extraction_headline":"A principled reward design for tool-use tasks lets reinforcement learning outperform supervised fine-tuning in training LLMs to use tools."},"references":{"count":46,"sample":[{"doi":"","year":null,"title":"Can a single model master both multi-turn conversations and tool use? coalm: A uni- fied conversational agentic language model. Preprint, arXiv:2502.08820. Jinheon Baek, Sujay Kumar Jauhar, Silviu Cuc","work_id":"0fc5f988-a294-4233-99e6-0d734965f4b5","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Researchagent: Iterative research idea generation over scientific literature with large language models,","work_id":"41213a8f-51aa-4065-b3d5-2f154966db88","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Program of Thoughts Prompting: Disentangling Computation from Reasoning for Numerical Reasoning Tasks","work_id":"618aa44c-a6c6-425c-abce-8aa8aa842921","ref_index":3,"cited_arxiv_id":"2211.12588","is_internal_anchor":true},{"doi":"","year":2024,"title":"In Findings of the Association for Compu- tational Linguistics: ACL 2024 , pages 9354–9366, Bangkok, Thailand","work_id":"90cd51e7-3c1c-451d-a021-7a7d089d473b","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"SFT Memorizes, RL Generalizes: A Comparative Study of Foundation Model Post-training","work_id":"258dd934-025c-47f5-b4f6-5a0c1c338cc6","ref_index":5,"cited_arxiv_id":"2501.17161","is_internal_anchor":true}],"resolved_work":46,"snapshot_sha256":"b24efdc154cb9fd05b118265ae3687bb9f4eabdcbb50524828d2ae6b46f82a53","internal_anchors":19},"formal_canon":{"evidence_count":2,"snapshot_sha256":"102fb83dfcb9d006b2485fa91c8a330fbcf79fa368aa5600b6839a1d96fbcc89"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"e2d1cb57-4334-4172-9e1b-d0df4bfd74db"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T03:22:05Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"AfKcXoCh4ov6qazdLDukeTpNVgBOz+ulPIdQubX4C3F5J0AdsGJlLfGV6LOiuwQrMSz996BxoZL2tDyFC4/7Ag==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-20T09:27:35.866000Z"},"content_sha256":"cec5034e049d71b800ff140dc255b5e0bfd1dc30db5a38041285348813658991","schema_version":"1.0","event_id":"sha256:cec5034e049d71b800ff140dc255b5e0bfd1dc30db5a38041285348813658991"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/EYMHDPD5YM3H6XWTQ4LIE3APIW/bundle.json","state_url":"https://pith.science/pith/EYMHDPD5YM3H6XWTQ4LIE3APIW/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/EYMHDPD5YM3H6XWTQ4LIE3APIW/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-20T09:27:35Z","links":{"resolver":"https://pith.science/pith/EYMHDPD5YM3H6XWTQ4LIE3APIW","bundle":"https://pith.science/pith/EYMHDPD5YM3H6XWTQ4LIE3APIW/bundle.json","state":"https://pith.science/pith/EYMHDPD5YM3H6XWTQ4LIE3APIW/state.json","well_known_bundle":"https://pith.science/.well-known/pith/EYMHDPD5YM3H6XWTQ4LIE3APIW/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2025:EYMHDPD5YM3H6XWTQ4LIE3APIW","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"554a6c4040adfff956401c0dcf839c06f1adf4c031130927d473224b6450fda5","cross_cats_sorted":["cs.AI","cs.CL"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2025-04-16T21:45:32Z","title_canon_sha256":"a6a3d8cbe619dc8a2acd102e7ff2545163a89ac48f1be9b07f337af429f6db69"},"schema_version":"1.0","source":{"id":"2504.13958","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2504.13958","created_at":"2026-05-18T03:22:05Z"},{"alias_kind":"arxiv_version","alias_value":"2504.13958v1","created_at":"2026-05-18T03:22:05Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2504.13958","created_at":"2026-05-18T03:22:05Z"},{"alias_kind":"pith_short_12","alias_value":"EYMHDPD5YM3H","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"EYMHDPD5YM3H6XWT","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"EYMHDPD5","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:cec5034e049d71b800ff140dc255b5e0bfd1dc30db5a38041285348813658991","target":"graph","created_at":"2026-05-18T03:22:05Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Empirical evaluations across diverse benchmarks demonstrate that our approach yields robust, scalable, and stable training, achieving a 17% improvement over base models and a 15% gain over SFT models."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"The explored reward strategies and the proposed principled design are assumed to transfer to tool-use scenarios outside the specific benchmarks and tool sets used in the experiments."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"A principled reward design for tool selection and application in RL-trained LLMs delivers 17% gains over base models and 15% over SFT across benchmarks."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"A principled reward design for tool-use tasks lets reinforcement learning outperform supervised fine-tuning in training LLMs to use tools."}],"snapshot_sha256":"e7806655713c5806b083448c7e35d7fcabdbc7ab0734f85664d0c75665d8e2ee"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"102fb83dfcb9d006b2485fa91c8a330fbcf79fa368aa5600b6839a1d96fbcc89"},"paper":{"abstract_excerpt":"Current Large Language Models (LLMs) often undergo supervised fine-tuning (SFT) to acquire tool use capabilities. However, SFT struggles to generalize to unfamiliar or complex tool use scenarios. Recent advancements in reinforcement learning (RL), particularly with R1-like models, have demonstrated promising reasoning and generalization abilities. Yet, reward design for tool use presents unique challenges: multiple tools may be invoked with diverse parameters, and coarse-grained reward signals, such as answer matching, fail to offer the finegrained feedback required for effective learning. In ","authors_text":"Cheng Qian, Dilek Hakkani-T\\\"ur, Emre Can Acikgoz, Gokhan Tur, Heng Ji, Hongru Wang, Qi He, Xiusi Chen","cross_cats":["cs.AI","cs.CL"],"headline":"A principled reward design for tool-use tasks lets reinforcement learning outperform supervised fine-tuning in training LLMs to use tools.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2025-04-16T21:45:32Z","title":"ToolRL: Reward is All Tool Learning Needs"},"references":{"count":46,"internal_anchors":19,"resolved_work":46,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Can a single model master both multi-turn conversations and tool use? coalm: A uni- fied conversational agentic language model. Preprint, arXiv:2502.08820. Jinheon Baek, Sujay Kumar Jauhar, Silviu Cuc","work_id":"0fc5f988-a294-4233-99e6-0d734965f4b5","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Researchagent: Iterative research idea generation over scientific literature with large language models,","work_id":"41213a8f-51aa-4065-b3d5-2f154966db88","year":null},{"cited_arxiv_id":"2211.12588","doi":"","is_internal_anchor":true,"ref_index":3,"title":"Program of Thoughts Prompting: Disentangling Computation from Reasoning for Numerical Reasoning Tasks","work_id":"618aa44c-a6c6-425c-abce-8aa8aa842921","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"In Findings of the Association for Compu- tational Linguistics: ACL 2024 , pages 9354–9366, Bangkok, Thailand","work_id":"90cd51e7-3c1c-451d-a021-7a7d089d473b","year":2024},{"cited_arxiv_id":"2501.17161","doi":"","is_internal_anchor":true,"ref_index":5,"title":"SFT Memorizes, RL Generalizes: A Comparative Study of Foundation Model Post-training","work_id":"258dd934-025c-47f5-b4f6-5a0c1c338cc6","year":null}],"snapshot_sha256":"b24efdc154cb9fd05b118265ae3687bb9f4eabdcbb50524828d2ae6b46f82a53"},"source":{"id":"2504.13958","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-14T00:21:51.946869Z","id":"e2d1cb57-4334-4172-9e1b-d0df4bfd74db","model_set":{"reader":"grok-4.3"},"one_line_summary":"A principled reward design for tool selection and application in RL-trained LLMs delivers 17% gains over base models and 15% over SFT across benchmarks.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"A principled reward design for tool-use tasks lets reinforcement learning outperform supervised fine-tuning in training LLMs to use tools.","strongest_claim":"Empirical evaluations across diverse benchmarks demonstrate that our approach yields robust, scalable, and stable training, achieving a 17% improvement over base models and a 15% gain over SFT models.","weakest_assumption":"The explored reward strategies and the proposed principled design are assumed to transfer to tool-use scenarios outside the specific benchmarks and tool sets used in the experiments."}},"verdict_id":"e2d1cb57-4334-4172-9e1b-d0df4bfd74db"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:5737557dda424bbb088e6f621567d7d050868be832ab14bca02a5062b48bb3cd","target":"record","created_at":"2026-05-18T03:22:05Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"554a6c4040adfff956401c0dcf839c06f1adf4c031130927d473224b6450fda5","cross_cats_sorted":["cs.AI","cs.CL"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2025-04-16T21:45:32Z","title_canon_sha256":"a6a3d8cbe619dc8a2acd102e7ff2545163a89ac48f1be9b07f337af429f6db69"},"schema_version":"1.0","source":{"id":"2504.13958","kind":"arxiv","version":1}},"canonical_sha256":"261871bc7dc3367f5ed38716826c0f459c73573a005c87b75c51d4dcf1edc70c","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"261871bc7dc3367f5ed38716826c0f459c73573a005c87b75c51d4dcf1edc70c","first_computed_at":"2026-05-18T03:22:05.942883Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-18T03:22:05.942883Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"SmZpG9mh8k5OaQrbinTWKTXOzXXfuGXTvHomzUkDu8kOldur5NF+0uJqYxOkPI/invvMXGBTVzB0vHsdRDx7Cw==","signature_status":"signed_v1","signed_at":"2026-05-18T03:22:05.943746Z","signed_message":"canonical_sha256_bytes"},"source_id":"2504.13958","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:5737557dda424bbb088e6f621567d7d050868be832ab14bca02a5062b48bb3cd","sha256:cec5034e049d71b800ff140dc255b5e0bfd1dc30db5a38041285348813658991"],"state_sha256":"9fbdf6579c37aca1342bab7ca379f31a3762279b518c4669e5b11ac6df7a9a08"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"6ZbhzFGk4dTq0jBGcmOd2VxSWprrEq7dSsSRGY4uOJn6IX/4huTce7fV1jPQdgWaK5yRs4O1MxkPNrw+YrzzDw==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-20T09:27:35.868805Z","bundle_sha256":"9e38bc4bea73ff662a852a2b0e94d78c5ac9f3520d76ca0e137d7752656a13b7"}}