{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2016:INO4DLMQNZVATFM4VDE2XJEB33","short_pith_number":"pith:INO4DLMQ","schema_version":"1.0","canonical_sha256":"435dc1ad906e6a09959ca8c9aba481dec36231297cba39d35c7ef91bc05914fb","source":{"kind":"arxiv","id":"1611.09207","version":1},"attestation_state":"computed","paper":{"title":"AutoMOS: Learning a non-intrusive assessor of naturalness-of-speech","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.LG","stat.ML"],"primary_cat":"cs.CL","authors_text":"Brian Patton, D. Sculley, Kevin Wilson, Michael Terry, Rif A. Saurous, Yannis Agiomyrgiannakis","submitted_at":"2016-11-28T15:51:25Z","abstract_excerpt":"Developers of text-to-speech synthesizers (TTS) often make use of human raters to assess the quality of synthesized speech. We demonstrate that we can model human raters' mean opinion scores (MOS) of synthesized speech using a deep recurrent neural network whose inputs consist solely of a raw waveform. Our best models provide utterance-level estimates of MOS only moderately inferior to sampled human ratings, as shown by Pearson and Spearman correlations. When multiple utterances are scored and averaged, a scenario common in synthesizer quality assessment, AutoMOS achieves correlations approach"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1611.09207","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2016-11-28T15:51:25Z","cross_cats_sorted":["cs.LG","stat.ML"],"title_canon_sha256":"7b518e5099dbe023b864c516e2ea09cf7c00930b828a17d0a79a28aef5881ad8","abstract_canon_sha256":"b53fc72c694336ca2b9e1247241a6cc71224ac4fbf3c619d2060f20c64808c11"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T00:56:28.816815Z","signature_b64":"aAjehYYT/epsUSBOg2VSqKPRXXV2vbHzmnkiP0LpCauob1S0L1dIeTid/7S97yaTVOPBcjOQewC1imVSka9qDQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"435dc1ad906e6a09959ca8c9aba481dec36231297cba39d35c7ef91bc05914fb","last_reissued_at":"2026-05-18T00:56:28.816028Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T00:56:28.816028Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"AutoMOS: Learning a non-intrusive assessor of naturalness-of-speech","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.LG","stat.ML"],"primary_cat":"cs.CL","authors_text":"Brian Patton, D. Sculley, Kevin Wilson, Michael Terry, Rif A. Saurous, Yannis Agiomyrgiannakis","submitted_at":"2016-11-28T15:51:25Z","abstract_excerpt":"Developers of text-to-speech synthesizers (TTS) often make use of human raters to assess the quality of synthesized speech. We demonstrate that we can model human raters' mean opinion scores (MOS) of synthesized speech using a deep recurrent neural network whose inputs consist solely of a raw waveform. Our best models provide utterance-level estimates of MOS only moderately inferior to sampled human ratings, as shown by Pearson and Spearman correlations. When multiple utterances are scored and averaged, a scenario common in synthesizer quality assessment, AutoMOS achieves correlations approach"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1611.09207","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1611.09207","created_at":"2026-05-18T00:56:28.816157+00:00"},{"alias_kind":"arxiv_version","alias_value":"1611.09207v1","created_at":"2026-05-18T00:56:28.816157+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1611.09207","created_at":"2026-05-18T00:56:28.816157+00:00"},{"alias_kind":"pith_short_12","alias_value":"INO4DLMQNZVA","created_at":"2026-05-18T12:30:22.444734+00:00"},{"alias_kind":"pith_short_16","alias_value":"INO4DLMQNZVATFM4","created_at":"2026-05-18T12:30:22.444734+00:00"},{"alias_kind":"pith_short_8","alias_value":"INO4DLMQ","created_at":"2026-05-18T12:30:22.444734+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":1,"internal_anchor_count":1,"sample":[{"citing_arxiv_id":"2502.05139","citing_title":"Meta Audiobox Aesthetics: Unified Automatic Quality Assessment for Speech, Music, and Sound","ref_index":77,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/INO4DLMQNZVATFM4VDE2XJEB33","json":"https://pith.science/pith/INO4DLMQNZVATFM4VDE2XJEB33.json","graph_json":"https://pith.science/api/pith-number/INO4DLMQNZVATFM4VDE2XJEB33/graph.json","events_json":"https://pith.science/api/pith-number/INO4DLMQNZVATFM4VDE2XJEB33/events.json","paper":"https://pith.science/paper/INO4DLMQ"},"agent_actions":{"view_html":"https://pith.science/pith/INO4DLMQNZVATFM4VDE2XJEB33","download_json":"https://pith.science/pith/INO4DLMQNZVATFM4VDE2XJEB33.json","view_paper":"https://pith.science/paper/INO4DLMQ","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1611.09207&json=true","fetch_graph":"https://pith.science/api/pith-number/INO4DLMQNZVATFM4VDE2XJEB33/graph.json","fetch_events":"https://pith.science/api/pith-number/INO4DLMQNZVATFM4VDE2XJEB33/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/INO4DLMQNZVATFM4VDE2XJEB33/action/timestamp_anchor","attest_storage":"https://pith.science/pith/INO4DLMQNZVATFM4VDE2XJEB33/action/storage_attestation","attest_author":"https://pith.science/pith/INO4DLMQNZVATFM4VDE2XJEB33/action/author_attestation","sign_citation":"https://pith.science/pith/INO4DLMQNZVATFM4VDE2XJEB33/action/citation_signature","submit_replication":"https://pith.science/pith/INO4DLMQNZVATFM4VDE2XJEB33/action/replication_record"}},"created_at":"2026-05-18T00:56:28.816157+00:00","updated_at":"2026-05-18T00:56:28.816157+00:00"}