{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2023:UFYTRM7B5SFJA3BM6YHAYR2ZDD","short_pith_number":"pith:UFYTRM7B","canonical_record":{"source":{"id":"2304.08244","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2023-04-14T14:05:32Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"ddc23efab1145fcf837291e50d35752677e16066633b2dcbaca68cdbe35390c0","abstract_canon_sha256":"2323c7ed354dd1b360749e7d13771790bd2adef54b9efe7551bc84f9e0c03dcb"},"schema_version":"1.0"},"canonical_sha256":"a17138b3e1ec8a906c2cf60e0c475918dac2d6fc59185a7cad1328b866a66d8d","source":{"kind":"arxiv","id":"2304.08244","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2304.08244","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"arxiv_version","alias_value":"2304.08244v2","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2304.08244","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"pith_short_12","alias_value":"UFYTRM7B5SFJ","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"UFYTRM7B5SFJA3BM","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"UFYTRM7B","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2023:UFYTRM7B5SFJA3BM6YHAYR2ZDD","target":"record","payload":{"canonical_record":{"source":{"id":"2304.08244","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2023-04-14T14:05:32Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"ddc23efab1145fcf837291e50d35752677e16066633b2dcbaca68cdbe35390c0","abstract_canon_sha256":"2323c7ed354dd1b360749e7d13771790bd2adef54b9efe7551bc84f9e0c03dcb"},"schema_version":"1.0"},"canonical_sha256":"a17138b3e1ec8a906c2cf60e0c475918dac2d6fc59185a7cad1328b866a66d8d","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:50.202283Z","signature_b64":"VPICsmKdAuVKFqE0WjJiF6uYf/zwQ512DK4YqvAyYK2KkFkC4y5UMsVv9XJRT0uPDqX8M3ZvwHliHmuMU3vEDQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"a17138b3e1ec8a906c2cf60e0c475918dac2d6fc59185a7cad1328b866a66d8d","last_reissued_at":"2026-05-17T23:38:50.201794Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:50.201794Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2304.08244","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:50Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"z+ubUlLrMgcWKST7iMBUhFueY4rK++LsPj5+WKB87aPKCwvfR2dX9NIUYspTRXlBen9inV1ei1f9no82rd/NAg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-10T14:39:31.679015Z"},"content_sha256":"16d0bb1737a7c6538f6920cd4729d5cc021a8c3b781ffdbf6281d92243fc0ad9","schema_version":"1.0","event_id":"sha256:16d0bb1737a7c6538f6920cd4729d5cc021a8c3b781ffdbf6281d92243fc0ad9"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2023:UFYTRM7B5SFJA3BM6YHAYR2ZDD","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"API-Bank: A Comprehensive Benchmark for Tool-Augmented LLMs","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"The API-Bank benchmark reveals that training Lynx on tool-use dialogues lets it surpass Alpaca by over 26 points and approach GPT-3.5 in using external APIs.","cross_cats":["cs.AI"],"primary_cat":"cs.CL","authors_text":"Bowen Yu, Feifan Song, Fei Huang, Haiyang Yu, Hangyu Li, Minghao Li, Yingxiu Zhao, Yongbin Li, Zhoujun Li","submitted_at":"2023-04-14T14:05:32Z","abstract_excerpt":"Recent research has demonstrated that Large Language Models (LLMs) can enhance their capabilities by utilizing external tools. However, three pivotal questions remain unanswered: (1) How effective are current LLMs in utilizing tools? (2) How can we enhance LLMs' ability to utilize tools? (3) What obstacles need to be overcome to leverage tools? To address these questions, we introduce API-Bank, a groundbreaking benchmark, specifically designed for tool-augmented LLMs. For the first question, we develop a runnable evaluation system consisting of 73 API tools. We annotate 314 tool-use dialogues "},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Lynx surpasses Alpaca's tool utilization performance by more than 26 pts and approaches the effectiveness of GPT-3.5.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The 73 chosen APIs and the 314 annotated dialogues are assumed to be representative of realistic tool-use scenarios and that the automatic evaluation system correctly scores planning, retrieval, and calling accuracy.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"API-Bank is a new benchmark and training dataset for tool-augmented LLMs that shows fine-tuned models can approach GPT-3.5 tool-use effectiveness.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"The API-Bank benchmark reveals that training Lynx on tool-use dialogues lets it surpass Alpaca by over 26 points and approach GPT-3.5 in using external APIs.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"bd7ee68c8adb7a8495a26e3bdb6fc44338b2742cfc10fc1119a9fd0dc2a81f97"},"source":{"id":"2304.08244","kind":"arxiv","version":2},"verdict":{"id":"6a5b7e07-9241-434a-bf99-bc34b4984ab7","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T20:47:56.852530Z","strongest_claim":"Lynx surpasses Alpaca's tool utilization performance by more than 26 pts and approaches the effectiveness of GPT-3.5.","one_line_summary":"API-Bank is a new benchmark and training dataset for tool-augmented LLMs that shows fine-tuned models can approach GPT-3.5 tool-use effectiveness.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The 73 chosen APIs and the 314 annotated dialogues are assumed to be representative of realistic tool-use scenarios and that the automatic evaluation system correctly scores planning, retrieval, and calling accuracy.","pith_extraction_headline":"The API-Bank benchmark reveals that training Lynx on tool-use dialogues lets it surpass Alpaca by over 26 points and approach GPT-3.5 in using external APIs."},"references":{"count":23,"sample":[{"doi":"","year":1901,"title":"Advances in neural information processing systems, 33:1877–1901","work_id":"91595e80-9195-4fc9-8ee1-b418a4f1eb6c","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Sparks of Artificial General Intelligence: Early experiments with GPT-4","work_id":"a23cfe92-7f7c-424b-98d4-b386a83002fb","ref_index":2,"cited_arxiv_id":"2303.12712","is_internal_anchor":true},{"doi":"","year":null,"title":"arXiv preprint arXiv:2305.17126 , year=","work_id":"1447b78e-0a79-4af6-8cd4-93220e680d2b","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Evaluating Large Language Models Trained on Code","work_id":"042493e9-b26f-4b4e-bbde-382072ca9b08","ref_index":4,"cited_arxiv_id":"2107.03374","is_internal_anchor":true},{"doi":"","year":null,"title":"arXiv preprint arXiv:2305.11554","work_id":"569aa9d1-da86-4292-9955-f937133dafea","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":23,"snapshot_sha256":"072504b9f09893ebc5d5c2c7b0be27aedb7c2b88a3312d19a0c6f37f31f56b09","internal_anchors":13},"formal_canon":{"evidence_count":2,"snapshot_sha256":"32ed8ca8719a6b20bd913114c9c388bd0e995a09551d69ba77c8a43da3d5708c"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"6a5b7e07-9241-434a-bf99-bc34b4984ab7"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:50Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"Y57/3t0xDL+rsS50hNc7/3UOEsj5mXJxY1bFl0N7pS12UVH7SQYwVGQZqyqKLFhmrKKfBwSp88ufjWdU43zVAA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-10T14:39:31.679642Z"},"content_sha256":"b99656fcd545dfc3d3ac036009dbd4ed1608e8615a9050ded5608ed325304764","schema_version":"1.0","event_id":"sha256:b99656fcd545dfc3d3ac036009dbd4ed1608e8615a9050ded5608ed325304764"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/UFYTRM7B5SFJA3BM6YHAYR2ZDD/bundle.json","state_url":"https://pith.science/pith/UFYTRM7B5SFJA3BM6YHAYR2ZDD/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/UFYTRM7B5SFJA3BM6YHAYR2ZDD/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-10T14:39:31Z","links":{"resolver":"https://pith.science/pith/UFYTRM7B5SFJA3BM6YHAYR2ZDD","bundle":"https://pith.science/pith/UFYTRM7B5SFJA3BM6YHAYR2ZDD/bundle.json","state":"https://pith.science/pith/UFYTRM7B5SFJA3BM6YHAYR2ZDD/state.json","well_known_bundle":"https://pith.science/.well-known/pith/UFYTRM7B5SFJA3BM6YHAYR2ZDD/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2023:UFYTRM7B5SFJA3BM6YHAYR2ZDD","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"2323c7ed354dd1b360749e7d13771790bd2adef54b9efe7551bc84f9e0c03dcb","cross_cats_sorted":["cs.AI"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2023-04-14T14:05:32Z","title_canon_sha256":"ddc23efab1145fcf837291e50d35752677e16066633b2dcbaca68cdbe35390c0"},"schema_version":"1.0","source":{"id":"2304.08244","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2304.08244","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"arxiv_version","alias_value":"2304.08244v2","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2304.08244","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"pith_short_12","alias_value":"UFYTRM7B5SFJ","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"UFYTRM7B5SFJA3BM","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"UFYTRM7B","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:b99656fcd545dfc3d3ac036009dbd4ed1608e8615a9050ded5608ed325304764","target":"graph","created_at":"2026-05-17T23:38:50Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Lynx surpasses Alpaca's tool utilization performance by more than 26 pts and approaches the effectiveness of GPT-3.5."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"The 73 chosen APIs and the 314 annotated dialogues are assumed to be representative of realistic tool-use scenarios and that the automatic evaluation system correctly scores planning, retrieval, and calling accuracy."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"API-Bank is a new benchmark and training dataset for tool-augmented LLMs that shows fine-tuned models can approach GPT-3.5 tool-use effectiveness."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"The API-Bank benchmark reveals that training Lynx on tool-use dialogues lets it surpass Alpaca by over 26 points and approach GPT-3.5 in using external APIs."}],"snapshot_sha256":"bd7ee68c8adb7a8495a26e3bdb6fc44338b2742cfc10fc1119a9fd0dc2a81f97"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"32ed8ca8719a6b20bd913114c9c388bd0e995a09551d69ba77c8a43da3d5708c"},"paper":{"abstract_excerpt":"Recent research has demonstrated that Large Language Models (LLMs) can enhance their capabilities by utilizing external tools. However, three pivotal questions remain unanswered: (1) How effective are current LLMs in utilizing tools? (2) How can we enhance LLMs' ability to utilize tools? (3) What obstacles need to be overcome to leverage tools? To address these questions, we introduce API-Bank, a groundbreaking benchmark, specifically designed for tool-augmented LLMs. For the first question, we develop a runnable evaluation system consisting of 73 API tools. We annotate 314 tool-use dialogues ","authors_text":"Bowen Yu, Feifan Song, Fei Huang, Haiyang Yu, Hangyu Li, Minghao Li, Yingxiu Zhao, Yongbin Li, Zhoujun Li","cross_cats":["cs.AI"],"headline":"The API-Bank benchmark reveals that training Lynx on tool-use dialogues lets it surpass Alpaca by over 26 points and approach GPT-3.5 in using external APIs.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2023-04-14T14:05:32Z","title":"API-Bank: A Comprehensive Benchmark for Tool-Augmented LLMs"},"references":{"count":23,"internal_anchors":13,"resolved_work":23,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Advances in neural information processing systems, 33:1877–1901","work_id":"91595e80-9195-4fc9-8ee1-b418a4f1eb6c","year":1901},{"cited_arxiv_id":"2303.12712","doi":"","is_internal_anchor":true,"ref_index":2,"title":"Sparks of Artificial General Intelligence: Early experiments with GPT-4","work_id":"a23cfe92-7f7c-424b-98d4-b386a83002fb","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"arXiv preprint arXiv:2305.17126 , year=","work_id":"1447b78e-0a79-4af6-8cd4-93220e680d2b","year":null},{"cited_arxiv_id":"2107.03374","doi":"","is_internal_anchor":true,"ref_index":4,"title":"Evaluating Large Language Models Trained on Code","work_id":"042493e9-b26f-4b4e-bbde-382072ca9b08","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"arXiv preprint arXiv:2305.11554","work_id":"569aa9d1-da86-4292-9955-f937133dafea","year":null}],"snapshot_sha256":"072504b9f09893ebc5d5c2c7b0be27aedb7c2b88a3312d19a0c6f37f31f56b09"},"source":{"id":"2304.08244","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-15T20:47:56.852530Z","id":"6a5b7e07-9241-434a-bf99-bc34b4984ab7","model_set":{"reader":"grok-4.3"},"one_line_summary":"API-Bank is a new benchmark and training dataset for tool-augmented LLMs that shows fine-tuned models can approach GPT-3.5 tool-use effectiveness.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"The API-Bank benchmark reveals that training Lynx on tool-use dialogues lets it surpass Alpaca by over 26 points and approach GPT-3.5 in using external APIs.","strongest_claim":"Lynx surpasses Alpaca's tool utilization performance by more than 26 pts and approaches the effectiveness of GPT-3.5.","weakest_assumption":"The 73 chosen APIs and the 314 annotated dialogues are assumed to be representative of realistic tool-use scenarios and that the automatic evaluation system correctly scores planning, retrieval, and calling accuracy."}},"verdict_id":"6a5b7e07-9241-434a-bf99-bc34b4984ab7"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:16d0bb1737a7c6538f6920cd4729d5cc021a8c3b781ffdbf6281d92243fc0ad9","target":"record","created_at":"2026-05-17T23:38:50Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"2323c7ed354dd1b360749e7d13771790bd2adef54b9efe7551bc84f9e0c03dcb","cross_cats_sorted":["cs.AI"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2023-04-14T14:05:32Z","title_canon_sha256":"ddc23efab1145fcf837291e50d35752677e16066633b2dcbaca68cdbe35390c0"},"schema_version":"1.0","source":{"id":"2304.08244","kind":"arxiv","version":2}},"canonical_sha256":"a17138b3e1ec8a906c2cf60e0c475918dac2d6fc59185a7cad1328b866a66d8d","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"a17138b3e1ec8a906c2cf60e0c475918dac2d6fc59185a7cad1328b866a66d8d","first_computed_at":"2026-05-17T23:38:50.201794Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:50.201794Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"VPICsmKdAuVKFqE0WjJiF6uYf/zwQ512DK4YqvAyYK2KkFkC4y5UMsVv9XJRT0uPDqX8M3ZvwHliHmuMU3vEDQ==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:50.202283Z","signed_message":"canonical_sha256_bytes"},"source_id":"2304.08244","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:16d0bb1737a7c6538f6920cd4729d5cc021a8c3b781ffdbf6281d92243fc0ad9","sha256:b99656fcd545dfc3d3ac036009dbd4ed1608e8615a9050ded5608ed325304764"],"state_sha256":"0ddc9890be5202b136555b5911b647794e98fa471cb2de17e15ac9660425848a"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"jqMQ0zbPusvmNqqxPOvReBPQkHGva2IPzkohA0f1walFDdqbyqQIM+13OZ8wwMl1xgd1CpFrBNx4MXxES3v6BA==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-10T14:39:31.682288Z","bundle_sha256":"0309f1ab7623c4c9bb12590cd595e8b31be9480349de988ddf39b9a0fc70f248"}}