{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2024:JQV66JLA35IBZFBTFZ3NRYLFR7","short_pith_number":"pith:JQV66JLA","canonical_record":{"source":{"id":"2407.10362","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by-sa/4.0/","primary_cat":"cs.AI","submitted_at":"2024-07-14T23:52:25Z","cross_cats_sorted":[],"title_canon_sha256":"e1e688186ac8a564ee4148b596d36e6270602308f20fb0f5c063dad5750372a3","abstract_canon_sha256":"93987a7bf6ec82cff30bd36782bcb0930d5cc6ddbca93afde4947e0547ac096e"},"schema_version":"1.0"},"canonical_sha256":"4c2bef2560df501c94332e76d8e1658fe77d63b751508f4118a5d4e623f63c80","source":{"kind":"arxiv","id":"2407.10362","version":3},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2407.10362","created_at":"2026-05-17T23:38:47Z"},{"alias_kind":"arxiv_version","alias_value":"2407.10362v3","created_at":"2026-05-17T23:38:47Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2407.10362","created_at":"2026-05-17T23:38:47Z"},{"alias_kind":"pith_short_12","alias_value":"JQV66JLA35IB","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"JQV66JLA35IBZFBT","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"JQV66JLA","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2024:JQV66JLA35IBZFBTFZ3NRYLFR7","target":"record","payload":{"canonical_record":{"source":{"id":"2407.10362","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by-sa/4.0/","primary_cat":"cs.AI","submitted_at":"2024-07-14T23:52:25Z","cross_cats_sorted":[],"title_canon_sha256":"e1e688186ac8a564ee4148b596d36e6270602308f20fb0f5c063dad5750372a3","abstract_canon_sha256":"93987a7bf6ec82cff30bd36782bcb0930d5cc6ddbca93afde4947e0547ac096e"},"schema_version":"1.0"},"canonical_sha256":"4c2bef2560df501c94332e76d8e1658fe77d63b751508f4118a5d4e623f63c80","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:47.379680Z","signature_b64":"vSStX4E5lseUKogt/ljCrX7Cmo4lgO1e38iggLBFRrRia494dbrWWoOPwXI+q+n4uIZ1H7eA8HdQ0+lV1iEPCw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"4c2bef2560df501c94332e76d8e1658fe77d63b751508f4118a5d4e623f63c80","last_reissued_at":"2026-05-17T23:38:47.379162Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:47.379162Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2407.10362","source_version":3,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:47Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"imKZkK02wuJ3xJ5KTX0SsuIz3fkJ1raYOOIuRonZF2LJ4XSDkFeChgFGhHs8Q9Z1YYG4P3kxAr/vmDbVBAfMBQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-26T03:39:13.682578Z"},"content_sha256":"f79d5cd2d883681f9a993cdef9d8f8c55880f886c47dee1789e5396687b2af06","schema_version":"1.0","event_id":"sha256:f79d5cd2d883681f9a993cdef9d8f8c55880f886c47dee1789e5396687b2af06"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2024:JQV66JLA35IBZFBTFZ3NRYLFR7","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"LAB-Bench: Measuring Capabilities of Language Models for Biology Research","license":"http://creativecommons.org/licenses/by-sa/4.0/","headline":"LAB-Bench introduces over 2,400 questions to test AI on practical biology research tasks such as literature search and sequence manipulation.","cross_cats":[],"primary_cat":"cs.AI","authors_text":"Andrew D. White, Jon M. Laurent, Joseph D. Janizek, Manvitha Ponnapati, Michaela M. Hinks, Michael J. Hammerling, Michael Ruzo, Samuel G. Rodriques, Siddharth Narayanan","submitted_at":"2024-07-14T23:52:25Z","abstract_excerpt":"There is widespread optimism that frontier Large Language Models (LLMs) and LLM-augmented systems have the potential to rapidly accelerate scientific discovery across disciplines. Today, many benchmarks exist to measure LLM knowledge and reasoning on textbook-style science questions, but few if any benchmarks are designed to evaluate language model performance on practical tasks required for scientific research, such as literature search, protocol planning, and data analysis. As a step toward building such benchmarks, we introduce the Language Agent Biology Benchmark (LAB-Bench), a broad datas"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"An AI system that can achieve consistently high scores on the more difficult LAB-Bench tasks would serve as a useful assistant for researchers in areas such as literature search and molecular cloning.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The multiple-choice questions in LAB-Bench accurately reflect the practical capabilities required for real-world biology research tasks, rather than testing only surface-level pattern matching.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"LAB-Bench provides over 2,400 multiple-choice questions to measure LLM performance on real biology research tasks like literature recall, figure reading, database access, and sequence manipulation, with initial results compared against human expert biologists.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"LAB-Bench introduces over 2,400 questions to test AI on practical biology research tasks such as literature search and sequence manipulation.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"299834ed872332ccc83d21cf388a00309e84b2662a7a614255cf124078ab9fa0"},"source":{"id":"2407.10362","kind":"arxiv","version":3},"verdict":{"id":"e0fc9f87-3847-4836-8602-697bc90dfc2d","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-16T15:54:22.695332Z","strongest_claim":"An AI system that can achieve consistently high scores on the more difficult LAB-Bench tasks would serve as a useful assistant for researchers in areas such as literature search and molecular cloning.","one_line_summary":"LAB-Bench provides over 2,400 multiple-choice questions to measure LLM performance on real biology research tasks like literature recall, figure reading, database access, and sequence manipulation, with initial results compared against human expert biologists.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The multiple-choice questions in LAB-Bench accurately reflect the practical capabilities required for real-world biology research tasks, rather than testing only surface-level pattern matching.","pith_extraction_headline":"LAB-Bench introduces over 2,400 questions to test AI on practical biology research tasks such as literature search and sequence manipulation."},"references":{"count":59,"sample":[{"doi":"","year":2015,"title":"Joanna S Amberger, Carol A Bocchini, François Schiettecatte, Alan F Scott, and Ada Hamosh. Omim. org: Online mendelian inheritance in man (omim®), an online catalog of human genes and genetic disorder","work_id":"d9644263-9bba-42f7-96b2-73411b093b8f","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"Introducing the next generation of claude, March 2024","work_id":"089ab74c-c20b-448c-b7e1-b1e3eeb8ea71","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"Introducing the next generation of claude, March 2024","work_id":"45581889-1f17-4b18-a18d-dec141414b05","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"Lessons from the Trenches on Reproducible Evaluation of Language Models","work_id":"47b597a2-a355-4305-b1e4-80666b394ccd","ref_index":4,"cited_arxiv_id":"2405.14782","is_internal_anchor":true},{"doi":"10.1038/s41586-023-06792-0","year":2023,"title":"Autonomous chemical research with large language models","work_id":"e15cebd6-c137-47c6-975e-41b70ed20de9","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":59,"snapshot_sha256":"8165fd4d148049a6ebd835e6a2358a84e0f26c5931754e99119fb4b53c67db66","internal_anchors":3},"formal_canon":{"evidence_count":3,"snapshot_sha256":"64d42f3f44977608c9c52a1aa465f4896288a6370c0800e0b3abf6b2293516f9"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"e0fc9f87-3847-4836-8602-697bc90dfc2d"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:47Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"EeO0ZR18lTm99nVNo3sk8MN3ejR4Nhk0NU59oySHiWHyMTwK3C8jsBICMzNxz+aiuVH5D7CgLDKrmQVlLhrPCQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-26T03:39:13.683240Z"},"content_sha256":"cd2de610ea2f80e3158b73e752e08e083997da23a82755c2df0b01c0df0a72b0","schema_version":"1.0","event_id":"sha256:cd2de610ea2f80e3158b73e752e08e083997da23a82755c2df0b01c0df0a72b0"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/JQV66JLA35IBZFBTFZ3NRYLFR7/bundle.json","state_url":"https://pith.science/pith/JQV66JLA35IBZFBTFZ3NRYLFR7/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/JQV66JLA35IBZFBTFZ3NRYLFR7/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-26T03:39:13Z","links":{"resolver":"https://pith.science/pith/JQV66JLA35IBZFBTFZ3NRYLFR7","bundle":"https://pith.science/pith/JQV66JLA35IBZFBTFZ3NRYLFR7/bundle.json","state":"https://pith.science/pith/JQV66JLA35IBZFBTFZ3NRYLFR7/state.json","well_known_bundle":"https://pith.science/.well-known/pith/JQV66JLA35IBZFBTFZ3NRYLFR7/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2024:JQV66JLA35IBZFBTFZ3NRYLFR7","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"93987a7bf6ec82cff30bd36782bcb0930d5cc6ddbca93afde4947e0547ac096e","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by-sa/4.0/","primary_cat":"cs.AI","submitted_at":"2024-07-14T23:52:25Z","title_canon_sha256":"e1e688186ac8a564ee4148b596d36e6270602308f20fb0f5c063dad5750372a3"},"schema_version":"1.0","source":{"id":"2407.10362","kind":"arxiv","version":3}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2407.10362","created_at":"2026-05-17T23:38:47Z"},{"alias_kind":"arxiv_version","alias_value":"2407.10362v3","created_at":"2026-05-17T23:38:47Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2407.10362","created_at":"2026-05-17T23:38:47Z"},{"alias_kind":"pith_short_12","alias_value":"JQV66JLA35IB","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"JQV66JLA35IBZFBT","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"JQV66JLA","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:cd2de610ea2f80e3158b73e752e08e083997da23a82755c2df0b01c0df0a72b0","target":"graph","created_at":"2026-05-17T23:38:47Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"An AI system that can achieve consistently high scores on the more difficult LAB-Bench tasks would serve as a useful assistant for researchers in areas such as literature search and molecular cloning."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"The multiple-choice questions in LAB-Bench accurately reflect the practical capabilities required for real-world biology research tasks, rather than testing only surface-level pattern matching."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"LAB-Bench provides over 2,400 multiple-choice questions to measure LLM performance on real biology research tasks like literature recall, figure reading, database access, and sequence manipulation, with initial results compared against human expert biologists."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"LAB-Bench introduces over 2,400 questions to test AI on practical biology research tasks such as literature search and sequence manipulation."}],"snapshot_sha256":"299834ed872332ccc83d21cf388a00309e84b2662a7a614255cf124078ab9fa0"},"formal_canon":{"evidence_count":3,"snapshot_sha256":"64d42f3f44977608c9c52a1aa465f4896288a6370c0800e0b3abf6b2293516f9"},"paper":{"abstract_excerpt":"There is widespread optimism that frontier Large Language Models (LLMs) and LLM-augmented systems have the potential to rapidly accelerate scientific discovery across disciplines. Today, many benchmarks exist to measure LLM knowledge and reasoning on textbook-style science questions, but few if any benchmarks are designed to evaluate language model performance on practical tasks required for scientific research, such as literature search, protocol planning, and data analysis. As a step toward building such benchmarks, we introduce the Language Agent Biology Benchmark (LAB-Bench), a broad datas","authors_text":"Andrew D. White, Jon M. Laurent, Joseph D. Janizek, Manvitha Ponnapati, Michaela M. Hinks, Michael J. Hammerling, Michael Ruzo, Samuel G. Rodriques, Siddharth Narayanan","cross_cats":[],"headline":"LAB-Bench introduces over 2,400 questions to test AI on practical biology research tasks such as literature search and sequence manipulation.","license":"http://creativecommons.org/licenses/by-sa/4.0/","primary_cat":"cs.AI","submitted_at":"2024-07-14T23:52:25Z","title":"LAB-Bench: Measuring Capabilities of Language Models for Biology Research"},"references":{"count":59,"internal_anchors":3,"resolved_work":59,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Joanna S Amberger, Carol A Bocchini, François Schiettecatte, Alan F Scott, and Ada Hamosh. Omim. org: Online mendelian inheritance in man (omim®), an online catalog of human genes and genetic disorder","work_id":"d9644263-9bba-42f7-96b2-73411b093b8f","year":2015},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Introducing the next generation of claude, March 2024","work_id":"089ab74c-c20b-448c-b7e1-b1e3eeb8ea71","year":2024},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Introducing the next generation of claude, March 2024","work_id":"45581889-1f17-4b18-a18d-dec141414b05","year":2024},{"cited_arxiv_id":"2405.14782","doi":"","is_internal_anchor":true,"ref_index":4,"title":"Lessons from the Trenches on Reproducible Evaluation of Language Models","work_id":"47b597a2-a355-4305-b1e4-80666b394ccd","year":2024},{"cited_arxiv_id":"","doi":"10.1038/s41586-023-06792-0","is_internal_anchor":false,"ref_index":5,"title":"Autonomous chemical research with large language models","work_id":"e15cebd6-c137-47c6-975e-41b70ed20de9","year":2023}],"snapshot_sha256":"8165fd4d148049a6ebd835e6a2358a84e0f26c5931754e99119fb4b53c67db66"},"source":{"id":"2407.10362","kind":"arxiv","version":3},"verdict":{"created_at":"2026-05-16T15:54:22.695332Z","id":"e0fc9f87-3847-4836-8602-697bc90dfc2d","model_set":{"reader":"grok-4.3"},"one_line_summary":"LAB-Bench provides over 2,400 multiple-choice questions to measure LLM performance on real biology research tasks like literature recall, figure reading, database access, and sequence manipulation, with initial results compared against human expert biologists.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"LAB-Bench introduces over 2,400 questions to test AI on practical biology research tasks such as literature search and sequence manipulation.","strongest_claim":"An AI system that can achieve consistently high scores on the more difficult LAB-Bench tasks would serve as a useful assistant for researchers in areas such as literature search and molecular cloning.","weakest_assumption":"The multiple-choice questions in LAB-Bench accurately reflect the practical capabilities required for real-world biology research tasks, rather than testing only surface-level pattern matching."}},"verdict_id":"e0fc9f87-3847-4836-8602-697bc90dfc2d"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:f79d5cd2d883681f9a993cdef9d8f8c55880f886c47dee1789e5396687b2af06","target":"record","created_at":"2026-05-17T23:38:47Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"93987a7bf6ec82cff30bd36782bcb0930d5cc6ddbca93afde4947e0547ac096e","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by-sa/4.0/","primary_cat":"cs.AI","submitted_at":"2024-07-14T23:52:25Z","title_canon_sha256":"e1e688186ac8a564ee4148b596d36e6270602308f20fb0f5c063dad5750372a3"},"schema_version":"1.0","source":{"id":"2407.10362","kind":"arxiv","version":3}},"canonical_sha256":"4c2bef2560df501c94332e76d8e1658fe77d63b751508f4118a5d4e623f63c80","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"4c2bef2560df501c94332e76d8e1658fe77d63b751508f4118a5d4e623f63c80","first_computed_at":"2026-05-17T23:38:47.379162Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:47.379162Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"vSStX4E5lseUKogt/ljCrX7Cmo4lgO1e38iggLBFRrRia494dbrWWoOPwXI+q+n4uIZ1H7eA8HdQ0+lV1iEPCw==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:47.379680Z","signed_message":"canonical_sha256_bytes"},"source_id":"2407.10362","source_kind":"arxiv","source_version":3}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:f79d5cd2d883681f9a993cdef9d8f8c55880f886c47dee1789e5396687b2af06","sha256:cd2de610ea2f80e3158b73e752e08e083997da23a82755c2df0b01c0df0a72b0"],"state_sha256":"e074e127280fa6b1f76dd77e6c475c86ca9503278ef5bdc3b5a818ab4dc6792e"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"nDZt8XJK/msSFW9nF08/pB0xa9k90qDoxWk/nxHJsTMtob71jmz7dlsmujMVWodZ90cX5OJGouNNVzi2PwiPBg==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-26T03:39:13.687244Z","bundle_sha256":"4c35164d347900a2a43448e86002fed8c75379c23d0bf4a217aa69e39cb10f96"}}