{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2025:C7FS43ZY7FEVE25KVLRGUPIPEH","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"b5e40b9361de399324381bcffd9d049ec056cb96913b881de6ae19dac5ccbeab","cross_cats_sorted":["cs.AI","cs.CL","eess.AS"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.SD","submitted_at":"2025-07-10T19:40:21Z","title_canon_sha256":"c6c0fe3e84f24dffea9d926559483dc8b4aae1ad8ed0b973ecc08b5e7237e806"},"schema_version":"1.0","source":{"id":"2507.08128","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2507.08128","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"arxiv_version","alias_value":"2507.08128v2","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2507.08128","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"pith_short_12","alias_value":"C7FS43ZY7FEV","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"C7FS43ZY7FEVE25K","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"C7FS43ZY","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:1c46905a98a51e9f5eb28d932ca7284a7c10889fbf2ed50bc46ce23e60bb2402","target":"graph","created_at":"2026-05-17T23:38:53Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"AF3 achieves new SOTA results on over 20+ (long) audio understanding and reasoning benchmarks, surpassing both open-weight and closed-source models trained on much larger datasets."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the newly introduced datasets and five-stage curriculum produce genuine generalization rather than benchmark-specific gains, and that all comparisons use identical evaluation protocols without undisclosed advantages."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Audio Flamingo 3 introduces an open large audio-language model achieving new state-of-the-art results on over 20 audio understanding and reasoning benchmarks using a unified encoder and curriculum training on open data."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Audio Flamingo 3 is a fully open large audio-language model that sets new state-of-the-art results on over twenty audio understanding and reasoning benchmarks using only open-source data."}],"snapshot_sha256":"61739c1547a299dafec5f569bdc620cb33833c7a962f0e396ae7603607c47617"},"formal_canon":{"evidence_count":3,"snapshot_sha256":"c802c9fa3d64325deb6b1497a258471f351502a086ee9eecbcfd3e496bff1d3b"},"paper":{"abstract_excerpt":"We present Audio Flamingo 3 (AF3), a fully open state-of-the-art (SOTA) large audio-language model that advances reasoning and understanding across speech, sound, and music. AF3 introduces: (i) AF-Whisper, a unified audio encoder trained using a novel strategy for joint representation learning across all 3 modalities of speech, sound, and music; (ii) flexible, on-demand thinking, allowing the model to do chain-of-thought-type reasoning before answering; (iii) multi-turn, multi-audio chat; (iv) long audio understanding and reasoning (including speech) up to 10 minutes; and (v) voice-to-voice in","authors_text":"Arushi Goel, Bryan Catanzaro, Chao-Han Huck Yang, Dinesh Manocha, Jaehyeon Kim, Rafael Valle, Ramani Duraiswami, Sang-gil Lee, Sonal Kumar, Sreyan Ghosh, Zhifeng Kong","cross_cats":["cs.AI","cs.CL","eess.AS"],"headline":"Audio Flamingo 3 is a fully open large audio-language model that sets new state-of-the-art results on over twenty audio understanding and reasoning benchmarks using only open-source data.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.SD","submitted_at":"2025-07-10T19:40:21Z","title":"Audio Flamingo 3: Advancing Audio Intelligence with Fully Open Large Audio Language Models"},"references":{"count":208,"internal_anchors":21,"resolved_work":208,"sample":[{"cited_arxiv_id":"2503.01743","doi":"","is_internal_anchor":true,"ref_index":1,"title":"Phi-4-Mini Technical Report: Compact yet Powerful Multimodal Language Models via Mixture-of-LoRAs","work_id":"83956045-536a-41ff-af02-b80e2a614eab","year":2025},{"cited_arxiv_id":"1609.08675","doi":"","is_internal_anchor":true,"ref_index":2,"title":"YouTube-8M: A Large-Scale Video Classification Benchmark","work_id":"6b543bd8-75e8-4c53-9718-b4545e4bc424","year":2016},{"cited_arxiv_id":"2301.11325","doi":"","is_internal_anchor":true,"ref_index":3,"title":"MusicLM: Generating Music From Text","work_id":"15e6566e-1c36-468f-966e-823248cbf87f","year":2023},{"cited_arxiv_id":"2406.02430","doi":"","is_internal_anchor":true,"ref_index":4,"title":"Seed-TTS: A Family of High-Quality Versatile Speech Generation Models","work_id":"6e88ee95-1133-4302-a142-cdf8f9456a8d","year":2024},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"R. Ardila, M. Branson, K. Davis, M. Henretty, M. Kohler, J. Meyer, R. Morais, L. Saunders, F. M. Tyers, and G. Weber. Common voice: A massively-multilingual speech corpus. In Proceedings of the 12th C","work_id":"c0ea9007-1463-4192-bef0-5bcd366eaa01","year":2020}],"snapshot_sha256":"859abea44efcf4fc6cc8c0c9aa68713d9203c13f325b5c5ee3ec29c643cccefd"},"source":{"id":"2507.08128","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-15T03:37:56.252379Z","id":"697c620a-a579-4359-8139-070b17ff1d58","model_set":{"reader":"grok-4.3"},"one_line_summary":"Audio Flamingo 3 introduces an open large audio-language model achieving new state-of-the-art results on over 20 audio understanding and reasoning benchmarks using a unified encoder and curriculum training on open data.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Audio Flamingo 3 is a fully open large audio-language model that sets new state-of-the-art results on over twenty audio understanding and reasoning benchmarks using only open-source data.","strongest_claim":"AF3 achieves new SOTA results on over 20+ (long) audio understanding and reasoning benchmarks, surpassing both open-weight and closed-source models trained on much larger datasets.","weakest_assumption":"That the newly introduced datasets and five-stage curriculum produce genuine generalization rather than benchmark-specific gains, and that all comparisons use identical evaluation protocols without undisclosed advantages."}},"verdict_id":"697c620a-a579-4359-8139-070b17ff1d58"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:e2b305a34f73b80bfb9395631f1e8d7124658f9fa6f32de897efcc3e164f39de","target":"record","created_at":"2026-05-17T23:38:53Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"b5e40b9361de399324381bcffd9d049ec056cb96913b881de6ae19dac5ccbeab","cross_cats_sorted":["cs.AI","cs.CL","eess.AS"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.SD","submitted_at":"2025-07-10T19:40:21Z","title_canon_sha256":"c6c0fe3e84f24dffea9d926559483dc8b4aae1ad8ed0b973ecc08b5e7237e806"},"schema_version":"1.0","source":{"id":"2507.08128","kind":"arxiv","version":2}},"canonical_sha256":"17cb2e6f38f949526baaaae26a3d0f21c85d4357d9b547c106780a44c96b2fb1","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"17cb2e6f38f949526baaaae26a3d0f21c85d4357d9b547c106780a44c96b2fb1","first_computed_at":"2026-05-17T23:38:53.661752Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:53.661752Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"bkOU/Qt2lajaHiII7lLqk/MnfTVlLw5++I6aNXfDt/7R2uQ95q+go+wLYKDuYLsj9Yg4mq6oLu7XB/fsqu8dDw==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:53.662306Z","signed_message":"canonical_sha256_bytes"},"source_id":"2507.08128","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:e2b305a34f73b80bfb9395631f1e8d7124658f9fa6f32de897efcc3e164f39de","sha256:1c46905a98a51e9f5eb28d932ca7284a7c10889fbf2ed50bc46ce23e60bb2402"],"state_sha256":"3927ae3f138fa1e6fed2aa27d76b38ed86bc5b37cf3ad7b1460dd98cb4f8f785"}