{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:OQYRLUQULUQRTYYGU77QP2O2GC","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"b7b820a49c61ac69743992d4fbec2e012698df7afc74b65942cac2ad79c17988","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2026-02-13T14:58:18Z","title_canon_sha256":"0bb4176b9601178143851b705cc003b5350dd99ff73d9050e6ab1e09faf5c443"},"schema_version":"1.0","source":{"id":"2602.12984","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2602.12984","created_at":"2026-06-02T01:03:43Z"},{"alias_kind":"arxiv_version","alias_value":"2602.12984v2","created_at":"2026-06-02T01:03:43Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2602.12984","created_at":"2026-06-02T01:03:43Z"},{"alias_kind":"pith_short_12","alias_value":"OQYRLUQULUQR","created_at":"2026-06-02T01:03:43Z"},{"alias_kind":"pith_short_16","alias_value":"OQYRLUQULUQRTYYG","created_at":"2026-06-02T01:03:43Z"},{"alias_kind":"pith_short_8","alias_value":"OQYRLUQU","created_at":"2026-06-02T01:03:43Z"}],"graph_snapshots":[{"event_id":"sha256:83522e4ebd1f147a3de25fb50689d1c7566388c1519d9ea215f00dfd2853cf52","target":"graph","created_at":"2026-06-02T01:03:43Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2602.12984/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Scientific reasoning inherently demands integrating sophisticated toolkits to navigate domain-specific knowledge. Yet, current benchmarks largely overlook agents' ability to orchestrate tools for such rigorous workflows. To bridge this gap, we introduce SciAgentGym, a scalable interactive environment featuring 1,780 domain-specific tools across four natural science disciplines, supported by a robust execution infrastructure. Complementing this, we present SciAgentBench, a tiered evaluation suite designed to stress-test agentic capabilities from elementary actions to long-horizon workflows. Our","authors_text":"Binze Hu, Huayu Sha, Jiazheng Zhang, Jingqi Tong, Jixuan Huang, Junlin Shang, Lei Bai, Ming Zhang, Qiyuan Peng, Qi Zhang, Shihan Dou, Tao Gui, Xingjun Ma, Xuanjing Huang, Yajie Yang, Yu-Gang Jiang, Yujiong Shen, Yutao Fan, Zhenfei Yin, Zhiheng Xi","cross_cats":[],"headline":"","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2026-02-13T14:58:18Z","title":"SciAgentGym: Benchmarking Multi-Step Scientific Tool-use in LLM Agents"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2602.12984","kind":"arxiv","version":2},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:397d3e280b85118d41118ebf38789c6aaa22a18101d0b45eb470342f5d057486","target":"record","created_at":"2026-06-02T01:03:43Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"b7b820a49c61ac69743992d4fbec2e012698df7afc74b65942cac2ad79c17988","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2026-02-13T14:58:18Z","title_canon_sha256":"0bb4176b9601178143851b705cc003b5350dd99ff73d9050e6ab1e09faf5c443"},"schema_version":"1.0","source":{"id":"2602.12984","kind":"arxiv","version":2}},"canonical_sha256":"743115d2145d2119e306a7ff07e9da3089b4ef382b96ecf5a54d301a22417c57","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"743115d2145d2119e306a7ff07e9da3089b4ef382b96ecf5a54d301a22417c57","first_computed_at":"2026-06-02T01:03:43.108980Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-06-02T01:03:43.108980Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"LNNkbf+DYxjKsWyq+n5AjzYBsoj7SiKytrWIDUlPO5kaHFP3xRNd6QvHu1m9MBD1c7B7xUDUJ/z/530zLjyjAA==","signature_status":"signed_v1","signed_at":"2026-06-02T01:03:43.109514Z","signed_message":"canonical_sha256_bytes"},"source_id":"2602.12984","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:397d3e280b85118d41118ebf38789c6aaa22a18101d0b45eb470342f5d057486","sha256:83522e4ebd1f147a3de25fb50689d1c7566388c1519d9ea215f00dfd2853cf52"],"state_sha256":"cf3b307ef6c2bc7f84d40b5c47c36223f95c8386cc108c8dabd42a06c25d736b"}