{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:4LJGKOX2EW7ZKHM5XEXO7UJKMJ","short_pith_number":"pith:4LJGKOX2","schema_version":"1.0","canonical_sha256":"e2d2653afa25bf951d9db92eefd12a625975049ac95c9367de8cc1197b6c7594","source":{"kind":"arxiv","id":"2607.01563","version":1},"attestation_state":"computed","paper":{"title":"Beyond Words: Towards Effective Modeling of Non-Verbal Vocalizations in ASR","license":"http://creativecommons.org/publicdomain/zero/1.0/","headline":"","cross_cats":[],"primary_cat":"eess.AS","authors_text":"Bach Do, Florian Metze, Gene Yang, Haibin Wu, Ming Sun, Minxue Niu, Peng Su, Ruizhe Huang, Shang-Wen Li, Suwon Shon, Yossi Adi, Yuzong Liu, Zhaoheng Ni","submitted_at":"2026-07-02T00:43:14Z","abstract_excerpt":"Modern automatic speech recognition (ASR) systems excel at transcribing lexical content but often omit nonverbal vocalizations (NVs), such as laughter, breaths, coughs, and cries, that carry conversational and affective information. Modeling NVs in ASR is challenging because NV annotations are sparse and highly long-tailed, with frequent categories such as breaths and laughter dominating rarer events such as cries and coughs. We study three data-centric strategies for improving low-resource NV recognition: (1) a two-stage curriculum that first maps all NV events to a generic token and then fin"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2607.01563","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/publicdomain/zero/1.0/","primary_cat":"eess.AS","submitted_at":"2026-07-02T00:43:14Z","cross_cats_sorted":[],"title_canon_sha256":"f3b1de34ce32039604ece2c61d42d3b4c08fd1281a103623b04573856a35bbb5","abstract_canon_sha256":"193f38613c0f4df12fb00aabb407c6f8ed8c2f1a9a5d9e37e85f11bc070ef266"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-07-03T00:17:04.131610Z","signature_b64":"Lji8hYQPgOldUmg3zRojsfakJhaPfa+KyeeHzLqhyOcTfo6usEHQXTk0Z+KyKnShwTqlQodxNxB9joe6c/iYCA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"e2d2653afa25bf951d9db92eefd12a625975049ac95c9367de8cc1197b6c7594","last_reissued_at":"2026-07-03T00:17:04.131168Z","signature_status":"signed_v1","first_computed_at":"2026-07-03T00:17:04.131168Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Beyond Words: Towards Effective Modeling of Non-Verbal Vocalizations in ASR","license":"http://creativecommons.org/publicdomain/zero/1.0/","headline":"","cross_cats":[],"primary_cat":"eess.AS","authors_text":"Bach Do, Florian Metze, Gene Yang, Haibin Wu, Ming Sun, Minxue Niu, Peng Su, Ruizhe Huang, Shang-Wen Li, Suwon Shon, Yossi Adi, Yuzong Liu, Zhaoheng Ni","submitted_at":"2026-07-02T00:43:14Z","abstract_excerpt":"Modern automatic speech recognition (ASR) systems excel at transcribing lexical content but often omit nonverbal vocalizations (NVs), such as laughter, breaths, coughs, and cries, that carry conversational and affective information. Modeling NVs in ASR is challenging because NV annotations are sparse and highly long-tailed, with frequent categories such as breaths and laughter dominating rarer events such as cries and coughs. We study three data-centric strategies for improving low-resource NV recognition: (1) a two-stage curriculum that first maps all NV events to a generic token and then fin"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2607.01563","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2607.01563/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2607.01563","created_at":"2026-07-03T00:17:04.131225+00:00"},{"alias_kind":"arxiv_version","alias_value":"2607.01563v1","created_at":"2026-07-03T00:17:04.131225+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2607.01563","created_at":"2026-07-03T00:17:04.131225+00:00"},{"alias_kind":"pith_short_12","alias_value":"4LJGKOX2EW7Z","created_at":"2026-07-03T00:17:04.131225+00:00"},{"alias_kind":"pith_short_16","alias_value":"4LJGKOX2EW7ZKHM5","created_at":"2026-07-03T00:17:04.131225+00:00"},{"alias_kind":"pith_short_8","alias_value":"4LJGKOX2","created_at":"2026-07-03T00:17:04.131225+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/4LJGKOX2EW7ZKHM5XEXO7UJKMJ","json":"https://pith.science/pith/4LJGKOX2EW7ZKHM5XEXO7UJKMJ.json","graph_json":"https://pith.science/api/pith-number/4LJGKOX2EW7ZKHM5XEXO7UJKMJ/graph.json","events_json":"https://pith.science/api/pith-number/4LJGKOX2EW7ZKHM5XEXO7UJKMJ/events.json","paper":"https://pith.science/paper/4LJGKOX2"},"agent_actions":{"view_html":"https://pith.science/pith/4LJGKOX2EW7ZKHM5XEXO7UJKMJ","download_json":"https://pith.science/pith/4LJGKOX2EW7ZKHM5XEXO7UJKMJ.json","view_paper":"https://pith.science/paper/4LJGKOX2","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2607.01563&json=true","fetch_graph":"https://pith.science/api/pith-number/4LJGKOX2EW7ZKHM5XEXO7UJKMJ/graph.json","fetch_events":"https://pith.science/api/pith-number/4LJGKOX2EW7ZKHM5XEXO7UJKMJ/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/4LJGKOX2EW7ZKHM5XEXO7UJKMJ/action/timestamp_anchor","attest_storage":"https://pith.science/pith/4LJGKOX2EW7ZKHM5XEXO7UJKMJ/action/storage_attestation","attest_author":"https://pith.science/pith/4LJGKOX2EW7ZKHM5XEXO7UJKMJ/action/author_attestation","sign_citation":"https://pith.science/pith/4LJGKOX2EW7ZKHM5XEXO7UJKMJ/action/citation_signature","submit_replication":"https://pith.science/pith/4LJGKOX2EW7ZKHM5XEXO7UJKMJ/action/replication_record"}},"created_at":"2026-07-03T00:17:04.131225+00:00","updated_at":"2026-07-03T00:17:04.131225+00:00"}