{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2024:FW7WKZDGCBC63F5U7E7H3CTPPD","short_pith_number":"pith:FW7WKZDG","schema_version":"1.0","canonical_sha256":"2dbf6564661045ed97b4f93e7d8a6f78f5fd116f2e1268a831df32dc4971d742","source":{"kind":"arxiv","id":"2412.21154","version":1},"attestation_state":"computed","paper":{"title":"Aviary: training language agents on challenging scientific tasks","license":"http://creativecommons.org/licenses/by-sa/4.0/","headline":"","cross_cats":["cs.CL","cs.LG"],"primary_cat":"cs.AI","authors_text":"Albert Bou, Andrew D. White, Geemi Wellawatte, James D. Braza, Jon Laurent, Manu Ponnapati, Ori Kabeli, Ryan-Rhys Griffiths, Sam Cox, Samuel G. Rodriques, Siddharth Narayanan","submitted_at":"2024-12-30T18:33:28Z","abstract_excerpt":"Solving complex real-world tasks requires cycles of actions and observations. This is particularly true in science, where tasks require many cycles of analysis, tool use, and experimentation. Language agents are promising for automating intellectual tasks in science because they can interact with tools via natural language or code. Yet their flexibility creates conceptual and practical challenges for software implementations, since agents may comprise non-standard components such as internal reasoning, planning, tool usage, as well as the inherent stochasticity of temperature-sampled language "},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2412.21154","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by-sa/4.0/","primary_cat":"cs.AI","submitted_at":"2024-12-30T18:33:28Z","cross_cats_sorted":["cs.CL","cs.LG"],"title_canon_sha256":"460b5cdacbf082a8573884eee3c6d244c511a9d3b0b2ea17e7432c47ef2cb873","abstract_canon_sha256":"e26e009f1d53d7b5cdf1c5e6b7380de715c3a8905544ecf095e8a01e24178763"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-07-05T09:55:28.108573Z","signature_b64":"hYIMXUNvBcICyRxaI3WDQnjizT8WS2xrc8HSk2vwanAjOKfjZXqkumQH4mpBpDEEg/4pbLuYNlCe55y1mMk/Dw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"2dbf6564661045ed97b4f93e7d8a6f78f5fd116f2e1268a831df32dc4971d742","last_reissued_at":"2026-07-05T09:55:28.108030Z","signature_status":"signed_v1","first_computed_at":"2026-07-05T09:55:28.108030Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Aviary: training language agents on challenging scientific tasks","license":"http://creativecommons.org/licenses/by-sa/4.0/","headline":"","cross_cats":["cs.CL","cs.LG"],"primary_cat":"cs.AI","authors_text":"Albert Bou, Andrew D. White, Geemi Wellawatte, James D. Braza, Jon Laurent, Manu Ponnapati, Ori Kabeli, Ryan-Rhys Griffiths, Sam Cox, Samuel G. Rodriques, Siddharth Narayanan","submitted_at":"2024-12-30T18:33:28Z","abstract_excerpt":"Solving complex real-world tasks requires cycles of actions and observations. This is particularly true in science, where tasks require many cycles of analysis, tool use, and experimentation. Language agents are promising for automating intellectual tasks in science because they can interact with tools via natural language or code. Yet their flexibility creates conceptual and practical challenges for software implementations, since agents may comprise non-standard components such as internal reasoning, planning, tool usage, as well as the inherent stochasticity of temperature-sampled language "},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2412.21154","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2412.21154/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2412.21154","created_at":"2026-07-05T09:55:28.108096+00:00"},{"alias_kind":"arxiv_version","alias_value":"2412.21154v1","created_at":"2026-07-05T09:55:28.108096+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2412.21154","created_at":"2026-07-05T09:55:28.108096+00:00"},{"alias_kind":"pith_short_12","alias_value":"FW7WKZDGCBC6","created_at":"2026-07-05T09:55:28.108096+00:00"},{"alias_kind":"pith_short_16","alias_value":"FW7WKZDGCBC63F5U","created_at":"2026-07-05T09:55:28.108096+00:00"},{"alias_kind":"pith_short_8","alias_value":"FW7WKZDG","created_at":"2026-07-05T09:55:28.108096+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":9,"internal_anchor_count":0,"sample":[{"citing_arxiv_id":"2606.09774","citing_title":"Auto-Configuring Scientific Simulators with Lightweight Coding-Agent Adapters","ref_index":21,"is_internal_anchor":false},{"citing_arxiv_id":"2605.07022","citing_title":"Self-Driving Datasets: From 20 Million Papers to Nuanced Biomedical Knowledge at Scale","ref_index":48,"is_internal_anchor":false},{"citing_arxiv_id":"2606.09774","citing_title":"Auto-Configuring Scientific Simulators with Lightweight Coding-Agent Adapters","ref_index":18,"is_internal_anchor":false},{"citing_arxiv_id":"2605.07022","citing_title":"Self-Driving Datasets: From 20 Million Papers to Nuanced Biomedical Knowledge at Scale","ref_index":48,"is_internal_anchor":false},{"citing_arxiv_id":"2506.22653","citing_title":"URSA: The Universal Research and Scientific Agent","ref_index":13,"is_internal_anchor":false},{"citing_arxiv_id":"2509.20374","citing_title":"CFDLLMBench: A Benchmark Suite for Evaluating Large Language Models in Computational Fluid Dynamics","ref_index":38,"is_internal_anchor":false},{"citing_arxiv_id":"2604.09554","citing_title":"LABBench2: An Improved Benchmark for AI Systems Performing Biology Research","ref_index":19,"is_internal_anchor":false},{"citing_arxiv_id":"2604.09554","citing_title":"LABBench2: An Improved Benchmark for AI Systems Performing Biology Research","ref_index":20,"is_internal_anchor":false},{"citing_arxiv_id":"2605.07022","citing_title":"Self-Driving Datasets: From 20 Million Papers to Nuanced Biomedical Knowledge at Scale","ref_index":49,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/FW7WKZDGCBC63F5U7E7H3CTPPD","json":"https://pith.science/pith/FW7WKZDGCBC63F5U7E7H3CTPPD.json","graph_json":"https://pith.science/api/pith-number/FW7WKZDGCBC63F5U7E7H3CTPPD/graph.json","events_json":"https://pith.science/api/pith-number/FW7WKZDGCBC63F5U7E7H3CTPPD/events.json","paper":"https://pith.science/paper/FW7WKZDG"},"agent_actions":{"view_html":"https://pith.science/pith/FW7WKZDGCBC63F5U7E7H3CTPPD","download_json":"https://pith.science/pith/FW7WKZDGCBC63F5U7E7H3CTPPD.json","view_paper":"https://pith.science/paper/FW7WKZDG","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2412.21154&json=true","fetch_graph":"https://pith.science/api/pith-number/FW7WKZDGCBC63F5U7E7H3CTPPD/graph.json","fetch_events":"https://pith.science/api/pith-number/FW7WKZDGCBC63F5U7E7H3CTPPD/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/FW7WKZDGCBC63F5U7E7H3CTPPD/action/timestamp_anchor","attest_storage":"https://pith.science/pith/FW7WKZDGCBC63F5U7E7H3CTPPD/action/storage_attestation","attest_author":"https://pith.science/pith/FW7WKZDGCBC63F5U7E7H3CTPPD/action/author_attestation","sign_citation":"https://pith.science/pith/FW7WKZDGCBC63F5U7E7H3CTPPD/action/citation_signature","submit_replication":"https://pith.science/pith/FW7WKZDGCBC63F5U7E7H3CTPPD/action/replication_record"}},"created_at":"2026-07-05T09:55:28.108096+00:00","updated_at":"2026-07-05T09:55:28.108096+00:00"}