{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2025:HSTTNVUORU4ZRZZQP5PP7FZFVQ","short_pith_number":"pith:HSTTNVUO","canonical_record":{"source":{"id":"2502.05139","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.SD","submitted_at":"2025-02-07T18:15:57Z","cross_cats_sorted":["cs.LG","eess.AS"],"title_canon_sha256":"7e2a263840a07dc04a37d63739240a33f090144e9013b03a9d60c4531c8cf641","abstract_canon_sha256":"5c8b048be85b85080bdafd9271bca2c65b40c3d294bc31f6f863deb31734fee3"},"schema_version":"1.0"},"canonical_sha256":"3ca736d68e8d3998e7307f5eff9725ac12f321b5e759d8aa79082852f7d7997c","source":{"kind":"arxiv","id":"2502.05139","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2502.05139","created_at":"2026-05-17T23:38:46Z"},{"alias_kind":"arxiv_version","alias_value":"2502.05139v1","created_at":"2026-05-17T23:38:46Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2502.05139","created_at":"2026-05-17T23:38:46Z"},{"alias_kind":"pith_short_12","alias_value":"HSTTNVUORU4Z","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"HSTTNVUORU4ZRZZQ","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"HSTTNVUO","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2025:HSTTNVUORU4ZRZZQP5PP7FZFVQ","target":"record","payload":{"canonical_record":{"source":{"id":"2502.05139","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.SD","submitted_at":"2025-02-07T18:15:57Z","cross_cats_sorted":["cs.LG","eess.AS"],"title_canon_sha256":"7e2a263840a07dc04a37d63739240a33f090144e9013b03a9d60c4531c8cf641","abstract_canon_sha256":"5c8b048be85b85080bdafd9271bca2c65b40c3d294bc31f6f863deb31734fee3"},"schema_version":"1.0"},"canonical_sha256":"3ca736d68e8d3998e7307f5eff9725ac12f321b5e759d8aa79082852f7d7997c","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:46.108479Z","signature_b64":"OIIy/wJ9mxPpR6i7hl/42rthu+DTrZzVvqmDy2S5klwQ4TseOTBa0vV4N5U8p0sI5FvgI0laLITJPVUgblfZAQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"3ca736d68e8d3998e7307f5eff9725ac12f321b5e759d8aa79082852f7d7997c","last_reissued_at":"2026-05-17T23:38:46.107980Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:46.107980Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2502.05139","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:46Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"UADPB565OKdhIn0NlIrS7Sc6yUbWN7UziF+F10bVfmakvtTQl9A6PJa3C+T/eIgYJkMoFvv2aHEZcubbej+eAQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-25T22:28:05.363189Z"},"content_sha256":"0ca6202acd2a3f0fb6c86a89a6c2035ef6b454300fa2435bd3132286a64fe018","schema_version":"1.0","event_id":"sha256:0ca6202acd2a3f0fb6c86a89a6c2035ef6b454300fa2435bd3132286a64fe018"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2025:HSTTNVUORU4ZRZZQP5PP7FZFVQ","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Meta Audiobox Aesthetics: Unified Automatic Quality Assessment for Speech, Music, and Sound","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"Decomposing audio aesthetics into four axes lets automatic models predict quality for speech, music, and sound at human-comparable levels.","cross_cats":["cs.LG","eess.AS"],"primary_cat":"cs.SD","authors_text":"Andros Tjandra, Ann Lee, Apoorv Vyas, Baishan Guo, Bowen Shi, Brian Ellis, Carleigh Wood, John Hoffman, Matt Le, Nick Zacharov, Sanyuan Chen, Wei-Ning Hsu, Yi-Chiao Wu","submitted_at":"2025-02-07T18:15:57Z","abstract_excerpt":"The quantification of audio aesthetics remains a complex challenge in audio processing, primarily due to its subjective nature, which is influenced by human perception and cultural context. Traditional methods often depend on human listeners for evaluation, leading to inconsistencies and high resource demands. This paper addresses the growing need for automated systems capable of predicting audio aesthetics without human intervention. Such systems are crucial for applications like data filtering, pseudo-labeling large datasets, and evaluating generative audio models, especially as these models"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Our models are evaluated against human mean opinion scores (MOS) and existing methods, demonstrating comparable or superior performance.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The four-axis annotation guidelines sufficiently capture the subjective and culturally influenced nature of audio aesthetics for the tested domains and generalize to new data.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Unified no-reference models assess audio aesthetics across speech, music, and sound via four perceptual axes and achieve performance comparable or superior to human mean opinion scores.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Decomposing audio aesthetics into four axes lets automatic models predict quality for speech, music, and sound at human-comparable levels.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"5b605faef8840331dc7670822f8fbdc26228848371a2809a476bac83e4acc028"},"source":{"id":"2502.05139","kind":"arxiv","version":1},"verdict":{"id":"25c3e294-150c-42c7-ae14-85383052eb7d","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-17T00:25:32.689178Z","strongest_claim":"Our models are evaluated against human mean opinion scores (MOS) and existing methods, demonstrating comparable or superior performance.","one_line_summary":"Unified no-reference models assess audio aesthetics across speech, music, and sound via four perceptual axes and achieve performance comparable or superior to human mean opinion scores.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The four-axis annotation guidelines sufficiently capture the subjective and culturally influenced nature of audio aesthetics for the tested domains and generalize to new data.","pith_extraction_headline":"Decomposing audio aesthetics into four axes lets automatic models predict quality for speech, music, and sound at human-comparable levels."},"references":{"count":72,"sample":[{"doi":"","year":null,"title":"Davis and Paul Mermelstein , Journal =","work_id":"0d79378d-d093-4d89-bbb7-462aafb12190","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Rabiner , Journal =","work_id":"5174cef0-8785-4987-8f97-ee52301528dc","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"The Elements of Statistical Learning -- Data Mining, Inference, and Prediction , Year =","work_id":"f3f4726f-3dd3-4cda-aa41-482cd780f997","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"A really good paper about","work_id":"dce4b865-110c-4b29-85b1-43054ef4162b","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"An excellent paper introducing the","work_id":"0c705b6b-5a6f-4615-b73c-345d78aba372","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":72,"snapshot_sha256":"ee56504583da662c675930dd688d68826d7a89cf391e3cb8d69a32d1d8c30874","internal_anchors":9},"formal_canon":{"evidence_count":2,"snapshot_sha256":"24b78bfde7f9816f8fe7fa7f4c2bf5f2d5a80e6bff27677231c2bb7ad88bb709"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"25c3e294-150c-42c7-ae14-85383052eb7d"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:46Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"NPaSKLs9VEI+zu9hrXZtXp9YFVqu5mGsNG40bvEgzhIPct375VdRpD8LCnyHplndxBsxkVhvc1HnbhLBpZbJAA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-25T22:28:05.364056Z"},"content_sha256":"16fc194c69915701ba5a269e41aca36f330188c74ce185654f60f064f5f4a9c1","schema_version":"1.0","event_id":"sha256:16fc194c69915701ba5a269e41aca36f330188c74ce185654f60f064f5f4a9c1"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/HSTTNVUORU4ZRZZQP5PP7FZFVQ/bundle.json","state_url":"https://pith.science/pith/HSTTNVUORU4ZRZZQP5PP7FZFVQ/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/HSTTNVUORU4ZRZZQP5PP7FZFVQ/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-25T22:28:05Z","links":{"resolver":"https://pith.science/pith/HSTTNVUORU4ZRZZQP5PP7FZFVQ","bundle":"https://pith.science/pith/HSTTNVUORU4ZRZZQP5PP7FZFVQ/bundle.json","state":"https://pith.science/pith/HSTTNVUORU4ZRZZQP5PP7FZFVQ/state.json","well_known_bundle":"https://pith.science/.well-known/pith/HSTTNVUORU4ZRZZQP5PP7FZFVQ/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2025:HSTTNVUORU4ZRZZQP5PP7FZFVQ","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"5c8b048be85b85080bdafd9271bca2c65b40c3d294bc31f6f863deb31734fee3","cross_cats_sorted":["cs.LG","eess.AS"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.SD","submitted_at":"2025-02-07T18:15:57Z","title_canon_sha256":"7e2a263840a07dc04a37d63739240a33f090144e9013b03a9d60c4531c8cf641"},"schema_version":"1.0","source":{"id":"2502.05139","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2502.05139","created_at":"2026-05-17T23:38:46Z"},{"alias_kind":"arxiv_version","alias_value":"2502.05139v1","created_at":"2026-05-17T23:38:46Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2502.05139","created_at":"2026-05-17T23:38:46Z"},{"alias_kind":"pith_short_12","alias_value":"HSTTNVUORU4Z","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"HSTTNVUORU4ZRZZQ","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"HSTTNVUO","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:16fc194c69915701ba5a269e41aca36f330188c74ce185654f60f064f5f4a9c1","target":"graph","created_at":"2026-05-17T23:38:46Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Our models are evaluated against human mean opinion scores (MOS) and existing methods, demonstrating comparable or superior performance."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"The four-axis annotation guidelines sufficiently capture the subjective and culturally influenced nature of audio aesthetics for the tested domains and generalize to new data."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Unified no-reference models assess audio aesthetics across speech, music, and sound via four perceptual axes and achieve performance comparable or superior to human mean opinion scores."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Decomposing audio aesthetics into four axes lets automatic models predict quality for speech, music, and sound at human-comparable levels."}],"snapshot_sha256":"5b605faef8840331dc7670822f8fbdc26228848371a2809a476bac83e4acc028"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"24b78bfde7f9816f8fe7fa7f4c2bf5f2d5a80e6bff27677231c2bb7ad88bb709"},"paper":{"abstract_excerpt":"The quantification of audio aesthetics remains a complex challenge in audio processing, primarily due to its subjective nature, which is influenced by human perception and cultural context. Traditional methods often depend on human listeners for evaluation, leading to inconsistencies and high resource demands. This paper addresses the growing need for automated systems capable of predicting audio aesthetics without human intervention. Such systems are crucial for applications like data filtering, pseudo-labeling large datasets, and evaluating generative audio models, especially as these models","authors_text":"Andros Tjandra, Ann Lee, Apoorv Vyas, Baishan Guo, Bowen Shi, Brian Ellis, Carleigh Wood, John Hoffman, Matt Le, Nick Zacharov, Sanyuan Chen, Wei-Ning Hsu, Yi-Chiao Wu","cross_cats":["cs.LG","eess.AS"],"headline":"Decomposing audio aesthetics into four axes lets automatic models predict quality for speech, music, and sound at human-comparable levels.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.SD","submitted_at":"2025-02-07T18:15:57Z","title":"Meta Audiobox Aesthetics: Unified Automatic Quality Assessment for Speech, Music, and Sound"},"references":{"count":72,"internal_anchors":9,"resolved_work":72,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Davis and Paul Mermelstein , Journal =","work_id":"0d79378d-d093-4d89-bbb7-462aafb12190","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Rabiner , Journal =","work_id":"5174cef0-8785-4987-8f97-ee52301528dc","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"The Elements of Statistical Learning -- Data Mining, Inference, and Prediction , Year =","work_id":"f3f4726f-3dd3-4cda-aa41-482cd780f997","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"A really good paper about","work_id":"dce4b865-110c-4b29-85b1-43054ef4162b","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"An excellent paper introducing the","work_id":"0c705b6b-5a6f-4615-b73c-345d78aba372","year":null}],"snapshot_sha256":"ee56504583da662c675930dd688d68826d7a89cf391e3cb8d69a32d1d8c30874"},"source":{"id":"2502.05139","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-17T00:25:32.689178Z","id":"25c3e294-150c-42c7-ae14-85383052eb7d","model_set":{"reader":"grok-4.3"},"one_line_summary":"Unified no-reference models assess audio aesthetics across speech, music, and sound via four perceptual axes and achieve performance comparable or superior to human mean opinion scores.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Decomposing audio aesthetics into four axes lets automatic models predict quality for speech, music, and sound at human-comparable levels.","strongest_claim":"Our models are evaluated against human mean opinion scores (MOS) and existing methods, demonstrating comparable or superior performance.","weakest_assumption":"The four-axis annotation guidelines sufficiently capture the subjective and culturally influenced nature of audio aesthetics for the tested domains and generalize to new data."}},"verdict_id":"25c3e294-150c-42c7-ae14-85383052eb7d"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:0ca6202acd2a3f0fb6c86a89a6c2035ef6b454300fa2435bd3132286a64fe018","target":"record","created_at":"2026-05-17T23:38:46Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"5c8b048be85b85080bdafd9271bca2c65b40c3d294bc31f6f863deb31734fee3","cross_cats_sorted":["cs.LG","eess.AS"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.SD","submitted_at":"2025-02-07T18:15:57Z","title_canon_sha256":"7e2a263840a07dc04a37d63739240a33f090144e9013b03a9d60c4531c8cf641"},"schema_version":"1.0","source":{"id":"2502.05139","kind":"arxiv","version":1}},"canonical_sha256":"3ca736d68e8d3998e7307f5eff9725ac12f321b5e759d8aa79082852f7d7997c","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"3ca736d68e8d3998e7307f5eff9725ac12f321b5e759d8aa79082852f7d7997c","first_computed_at":"2026-05-17T23:38:46.107980Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:46.107980Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"OIIy/wJ9mxPpR6i7hl/42rthu+DTrZzVvqmDy2S5klwQ4TseOTBa0vV4N5U8p0sI5FvgI0laLITJPVUgblfZAQ==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:46.108479Z","signed_message":"canonical_sha256_bytes"},"source_id":"2502.05139","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:0ca6202acd2a3f0fb6c86a89a6c2035ef6b454300fa2435bd3132286a64fe018","sha256:16fc194c69915701ba5a269e41aca36f330188c74ce185654f60f064f5f4a9c1"],"state_sha256":"dc709e364a72b2ebc3aa24eee517d89ca1c32e16263394a53710bc765029c6b3"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"NaCi9uSzz7p22w6v3WmOWmfwBbO6TVFbndCv6f1THXKSd47ja3l1Y5xgglPiMyZ62LXbWrJF1SpQaqsHz3zeBQ==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-25T22:28:05.367742Z","bundle_sha256":"33b7a85dd6399c02eb7f4ea8ba1e457aacf04a734af8f7421aa94d1e79a57512"}}