{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:Y4AG2MRSBBK4P3TIVF5NRCUW56","short_pith_number":"pith:Y4AG2MRS","schema_version":"1.0","canonical_sha256":"c7006d32320855c7ee68a97ad88a96efbd786a0e2fcf5b172a6708d8ebfa2b1d","source":{"kind":"arxiv","id":"2604.07649","version":4},"attestation_state":"computed","paper":{"title":"LitXBench: A Benchmark for Extracting Experiments from Scientific Literature","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Frontier language models extract full experiments from papers 0.37 F1 better than multi-turn pipelines by tying measurements to processing steps.","cross_cats":[],"primary_cat":"cs.IR","authors_text":"Curtis Chong, Jorge Colindres","submitted_at":"2026-04-08T23:31:31Z","abstract_excerpt":"Aggregating experimental data from papers enables materials scientists to build better property prediction models and to facilitate scientific discovery. Recently, interest has grown in extracting not only single material properties but also entire experimental measurements. To support this shift, we introduce LitXBench, a framework for benchmarking methods that extract experiments from literature. We also present LitXAlloy, a dense benchmark comprising 1426 total measurements from 19 alloy papers. By storing the benchmark's entries as Python objects, rather than text-based formats such as CSV"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2604.07649","kind":"arxiv","version":4},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.IR","submitted_at":"2026-04-08T23:31:31Z","cross_cats_sorted":[],"title_canon_sha256":"672b0eebe7710de71e1ceb84bdc74e8939844f7a745f9525e5b398750ce87187","abstract_canon_sha256":"703fb0a62f6c6d2b4abed635a73a94ca4c3812e135768350574490889296af3e"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T00:05:44.462277Z","signature_b64":"LJRDwgpE3ZKCpLXSMsm0QtY9oSEMVQF46L8e2Zrt62SQtTfdROn7GXXyEgGc166vc9sNGXMSKIvwTRqF3dgyDg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"c7006d32320855c7ee68a97ad88a96efbd786a0e2fcf5b172a6708d8ebfa2b1d","last_reissued_at":"2026-05-20T00:05:44.461720Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T00:05:44.461720Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"LitXBench: A Benchmark for Extracting Experiments from Scientific Literature","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Frontier language models extract full experiments from papers 0.37 F1 better than multi-turn pipelines by tying measurements to processing steps.","cross_cats":[],"primary_cat":"cs.IR","authors_text":"Curtis Chong, Jorge Colindres","submitted_at":"2026-04-08T23:31:31Z","abstract_excerpt":"Aggregating experimental data from papers enables materials scientists to build better property prediction models and to facilitate scientific discovery. Recently, interest has grown in extracting not only single material properties but also entire experimental measurements. To support this shift, we introduce LitXBench, a framework for benchmarking methods that extract experiments from literature. We also present LitXAlloy, a dense benchmark comprising 1426 total measurements from 19 alloy papers. By storing the benchmark's entries as Python objects, rather than text-based formats such as CSV"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"frontier language models, such as Gemini 3.1 Pro Preview, outperform existing multi-turn extraction pipelines by up to 0.37 F1. Our results suggest that this performance gap arises because extraction pipelines associate measurements with compositions rather than the processing steps that define a material.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the 19 alloy papers and 1426 measurements in LitXAlloy form a representative and unbiased sample of real-world extraction challenges, and that the observed F1 gap is causally due to the association with processing steps rather than other factors like prompt design or model scale.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"LitXBench shows frontier LLMs like Gemini 3.1 Pro Preview outperform extraction pipelines by 0.37 F1 because they link measurements to processing steps rather than just compositions.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Frontier language models extract full experiments from papers 0.37 F1 better than multi-turn pipelines by tying measurements to processing steps.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"d54e5dd8af232ca4b50b199bea3a9f581464f7da0233c8f639bbcf7ad152b9e4"},"source":{"id":"2604.07649","kind":"arxiv","version":4},"verdict":{"id":"dbc741e4-b3d7-4632-92c8-847ffbf5682e","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-13T00:58:20.774753Z","strongest_claim":"frontier language models, such as Gemini 3.1 Pro Preview, outperform existing multi-turn extraction pipelines by up to 0.37 F1. Our results suggest that this performance gap arises because extraction pipelines associate measurements with compositions rather than the processing steps that define a material.","one_line_summary":"LitXBench shows frontier LLMs like Gemini 3.1 Pro Preview outperform extraction pipelines by 0.37 F1 because they link measurements to processing steps rather than just compositions.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the 19 alloy papers and 1426 measurements in LitXAlloy form a representative and unbiased sample of real-world extraction challenges, and that the observed F1 gap is causally due to the association with processing steps rather than other factors like prompt design or model scale.","pith_extraction_headline":"Frontier language models extract full experiments from papers 0.37 F1 better than multi-turn pipelines by tying measurements to processing steps."},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2604.07649/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2604.07649","created_at":"2026-05-20T00:05:44.461789+00:00"},{"alias_kind":"arxiv_version","alias_value":"2604.07649v4","created_at":"2026-05-20T00:05:44.461789+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2604.07649","created_at":"2026-05-20T00:05:44.461789+00:00"},{"alias_kind":"pith_short_12","alias_value":"Y4AG2MRSBBK4","created_at":"2026-05-20T00:05:44.461789+00:00"},{"alias_kind":"pith_short_16","alias_value":"Y4AG2MRSBBK4P3TI","created_at":"2026-05-20T00:05:44.461789+00:00"},{"alias_kind":"pith_short_8","alias_value":"Y4AG2MRS","created_at":"2026-05-20T00:05:44.461789+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/Y4AG2MRSBBK4P3TIVF5NRCUW56","json":"https://pith.science/pith/Y4AG2MRSBBK4P3TIVF5NRCUW56.json","graph_json":"https://pith.science/api/pith-number/Y4AG2MRSBBK4P3TIVF5NRCUW56/graph.json","events_json":"https://pith.science/api/pith-number/Y4AG2MRSBBK4P3TIVF5NRCUW56/events.json","paper":"https://pith.science/paper/Y4AG2MRS"},"agent_actions":{"view_html":"https://pith.science/pith/Y4AG2MRSBBK4P3TIVF5NRCUW56","download_json":"https://pith.science/pith/Y4AG2MRSBBK4P3TIVF5NRCUW56.json","view_paper":"https://pith.science/paper/Y4AG2MRS","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2604.07649&json=true","fetch_graph":"https://pith.science/api/pith-number/Y4AG2MRSBBK4P3TIVF5NRCUW56/graph.json","fetch_events":"https://pith.science/api/pith-number/Y4AG2MRSBBK4P3TIVF5NRCUW56/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/Y4AG2MRSBBK4P3TIVF5NRCUW56/action/timestamp_anchor","attest_storage":"https://pith.science/pith/Y4AG2MRSBBK4P3TIVF5NRCUW56/action/storage_attestation","attest_author":"https://pith.science/pith/Y4AG2MRSBBK4P3TIVF5NRCUW56/action/author_attestation","sign_citation":"https://pith.science/pith/Y4AG2MRSBBK4P3TIVF5NRCUW56/action/citation_signature","submit_replication":"https://pith.science/pith/Y4AG2MRSBBK4P3TIVF5NRCUW56/action/replication_record"}},"created_at":"2026-05-20T00:05:44.461789+00:00","updated_at":"2026-05-20T00:05:44.461789+00:00"}