{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:KFK7XDN6PDEPL4QJVBXL6DJIUP","short_pith_number":"pith:KFK7XDN6","schema_version":"1.0","canonical_sha256":"5155fb8dbe78c8f5f209a86ebf0d28a3f45b3d3bbfff215c84f23156d4902a3d","source":{"kind":"arxiv","id":"2606.22388","version":1},"attestation_state":"computed","paper":{"title":"PlanBench-XL: Evaluating Long-Horizon Planning of LLM Tool-Use Agents in Large-Scale Tool Ecosystems","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.CL"],"primary_cat":"cs.AI","authors_text":"Cheng Qian, Dilek Hakkani-T\\\"ur, Emre Can Acikgoz, Heng Ji, Jiateng Liu, Jiayu Liu, Qihan Lin, Rui Wang, Xiaocheng Yang, Xiusi Chen, Zhenhailong Wang","submitted_at":"2026-06-21T08:29:12Z","abstract_excerpt":"LLM agents increasingly operate in large tool ecosystems, where real-world tasks require discovering relevant tools, inferring implicit sub-goals, and adapting to dynamic environments over long horizons. However, existing benchmarks rarely evaluate planning under retrieval-limited tool visibility. To address this gap, we introduce PlanBench-XL, an interactive benchmark of 327 retail tasks over 1,665 tools that tests whether agents can iteratively retrieve usable tools, invoke them to uncover intermediate evidence for subsequent calls toward the final goal. PlanBench-XL further features an opti"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.22388","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2026-06-21T08:29:12Z","cross_cats_sorted":["cs.CL"],"title_canon_sha256":"3337832a6944cba11c58c48b5fd17c95bf6d8a1724350fef10f8cbc3cf9bc0fa","abstract_canon_sha256":"e9535fc16b802b983cdc6d2900a6faa3b0447569591c0856401201504150c102"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-23T02:13:37.126549Z","signature_b64":"bBqZsdVHx+rbbhbYba3OeB19RqCkfzxEwuRhF6QMlm78C9Ucg6kJSsihzGDqUdPqTYJ7ws2o3gLcLXwSv/3LDA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"5155fb8dbe78c8f5f209a86ebf0d28a3f45b3d3bbfff215c84f23156d4902a3d","last_reissued_at":"2026-06-23T02:13:37.126117Z","signature_status":"signed_v1","first_computed_at":"2026-06-23T02:13:37.126117Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"PlanBench-XL: Evaluating Long-Horizon Planning of LLM Tool-Use Agents in Large-Scale Tool Ecosystems","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.CL"],"primary_cat":"cs.AI","authors_text":"Cheng Qian, Dilek Hakkani-T\\\"ur, Emre Can Acikgoz, Heng Ji, Jiateng Liu, Jiayu Liu, Qihan Lin, Rui Wang, Xiaocheng Yang, Xiusi Chen, Zhenhailong Wang","submitted_at":"2026-06-21T08:29:12Z","abstract_excerpt":"LLM agents increasingly operate in large tool ecosystems, where real-world tasks require discovering relevant tools, inferring implicit sub-goals, and adapting to dynamic environments over long horizons. However, existing benchmarks rarely evaluate planning under retrieval-limited tool visibility. To address this gap, we introduce PlanBench-XL, an interactive benchmark of 327 retail tasks over 1,665 tools that tests whether agents can iteratively retrieve usable tools, invoke them to uncover intermediate evidence for subsequent calls toward the final goal. PlanBench-XL further features an opti"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.22388","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.22388/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.22388","created_at":"2026-06-23T02:13:37.126180+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.22388v1","created_at":"2026-06-23T02:13:37.126180+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.22388","created_at":"2026-06-23T02:13:37.126180+00:00"},{"alias_kind":"pith_short_12","alias_value":"KFK7XDN6PDEP","created_at":"2026-06-23T02:13:37.126180+00:00"},{"alias_kind":"pith_short_16","alias_value":"KFK7XDN6PDEPL4QJ","created_at":"2026-06-23T02:13:37.126180+00:00"},{"alias_kind":"pith_short_8","alias_value":"KFK7XDN6","created_at":"2026-06-23T02:13:37.126180+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/KFK7XDN6PDEPL4QJVBXL6DJIUP","json":"https://pith.science/pith/KFK7XDN6PDEPL4QJVBXL6DJIUP.json","graph_json":"https://pith.science/api/pith-number/KFK7XDN6PDEPL4QJVBXL6DJIUP/graph.json","events_json":"https://pith.science/api/pith-number/KFK7XDN6PDEPL4QJVBXL6DJIUP/events.json","paper":"https://pith.science/paper/KFK7XDN6"},"agent_actions":{"view_html":"https://pith.science/pith/KFK7XDN6PDEPL4QJVBXL6DJIUP","download_json":"https://pith.science/pith/KFK7XDN6PDEPL4QJVBXL6DJIUP.json","view_paper":"https://pith.science/paper/KFK7XDN6","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.22388&json=true","fetch_graph":"https://pith.science/api/pith-number/KFK7XDN6PDEPL4QJVBXL6DJIUP/graph.json","fetch_events":"https://pith.science/api/pith-number/KFK7XDN6PDEPL4QJVBXL6DJIUP/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/KFK7XDN6PDEPL4QJVBXL6DJIUP/action/timestamp_anchor","attest_storage":"https://pith.science/pith/KFK7XDN6PDEPL4QJVBXL6DJIUP/action/storage_attestation","attest_author":"https://pith.science/pith/KFK7XDN6PDEPL4QJVBXL6DJIUP/action/author_attestation","sign_citation":"https://pith.science/pith/KFK7XDN6PDEPL4QJVBXL6DJIUP/action/citation_signature","submit_replication":"https://pith.science/pith/KFK7XDN6PDEPL4QJVBXL6DJIUP/action/replication_record"}},"created_at":"2026-06-23T02:13:37.126180+00:00","updated_at":"2026-06-23T02:13:37.126180+00:00"}