{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2023:ZPTH6N7ZTLWW3QYKEHHVOVZTJU","short_pith_number":"pith:ZPTH6N7Z","schema_version":"1.0","canonical_sha256":"cbe67f37f99aed6dc30a21cf5757334d0be0944f324e16ee864e6f225b8a7b4e","source":{"kind":"arxiv","id":"2310.13289","version":2},"attestation_state":"computed","paper":{"title":"SALMONN: Towards Generic Hearing Abilities for Large Language Models","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.CL","eess.AS"],"primary_cat":"cs.SD","authors_text":"Changli Tang, Chao Zhang, Guangzhi Sun, Lu Lu, Tian Tan, Wei Li, Wenyi Yu, Xianzhao Chen, Zejun Ma","submitted_at":"2023-10-20T05:41:57Z","abstract_excerpt":"Hearing is arguably an essential ability of artificial intelligence (AI) agents in the physical world, which refers to the perception and understanding of general auditory information consisting of at least three types of sounds: speech, audio events, and music. In this paper, we propose SALMONN, a speech audio language music open neural network, built by integrating a pre-trained text-based large language model (LLM) with speech and audio encoders into a single multimodal model. SALMONN enables the LLM to directly process and understand general audio inputs and achieve competitive performance"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2310.13289","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.SD","submitted_at":"2023-10-20T05:41:57Z","cross_cats_sorted":["cs.CL","eess.AS"],"title_canon_sha256":"cdacaa27bbc9bce6460f12a6c006abeccafde899b701042a49d025b06b52a1c6","abstract_canon_sha256":"7f9ea9a8ff0eecb8730a49a2cb8035ebc631f4056e9f12d15bb2451e31dd7724"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T02:23:34.307102Z","signature_b64":"NcJo8AiIJoN9b01SsBlI4Io7oVGJeQ7XnKKPc1ZJAVr1WPT7s9CBasGrVWFv7oggQbpY/Q2sLgW5CNbbKaocAA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"cbe67f37f99aed6dc30a21cf5757334d0be0944f324e16ee864e6f225b8a7b4e","last_reissued_at":"2026-05-18T02:23:34.306445Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T02:23:34.306445Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"SALMONN: Towards Generic Hearing Abilities for Large Language Models","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.CL","eess.AS"],"primary_cat":"cs.SD","authors_text":"Changli Tang, Chao Zhang, Guangzhi Sun, Lu Lu, Tian Tan, Wei Li, Wenyi Yu, Xianzhao Chen, Zejun Ma","submitted_at":"2023-10-20T05:41:57Z","abstract_excerpt":"Hearing is arguably an essential ability of artificial intelligence (AI) agents in the physical world, which refers to the perception and understanding of general auditory information consisting of at least three types of sounds: speech, audio events, and music. In this paper, we propose SALMONN, a speech audio language music open neural network, built by integrating a pre-trained text-based large language model (LLM) with speech and audio encoders into a single multimodal model. SALMONN enables the LLM to directly process and understand general audio inputs and achieve competitive performance"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2310.13289","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2310.13289","created_at":"2026-05-18T02:23:34.306532+00:00"},{"alias_kind":"arxiv_version","alias_value":"2310.13289v2","created_at":"2026-05-18T02:23:34.306532+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2310.13289","created_at":"2026-05-18T02:23:34.306532+00:00"},{"alias_kind":"pith_short_12","alias_value":"ZPTH6N7ZTLWW","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_16","alias_value":"ZPTH6N7ZTLWW3QYK","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_8","alias_value":"ZPTH6N7Z","created_at":"2026-05-18T12:33:37.589309+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":20,"internal_anchor_count":20,"sample":[{"citing_arxiv_id":"2605.19950","citing_title":"AffectVerse: Emotional World Models for Multimodal Affective Computing","ref_index":33,"is_internal_anchor":true},{"citing_arxiv_id":"2505.22765","citing_title":"StressTest: Can YOUR Speech LM Handle the Stress?","ref_index":5,"is_internal_anchor":true},{"citing_arxiv_id":"2506.04565","citing_title":"From Standalone LLMs to Integrated Intelligence: A Survey of Compound Al Systems","ref_index":171,"is_internal_anchor":true},{"citing_arxiv_id":"2509.15692","citing_title":"Direct Simultaneous Translation Activation for Large Audio-Language Models","ref_index":15,"is_internal_anchor":true},{"citing_arxiv_id":"2502.04326","citing_title":"WorldSense: Evaluating Real-world Omnimodal Understanding for Multimodal LLMs","ref_index":60,"is_internal_anchor":true},{"citing_arxiv_id":"2512.01512","citing_title":"MCAT: Scaling Many-to-Many Speech-to-Text Translation with MLLMs to 70 Languages","ref_index":17,"is_internal_anchor":true},{"citing_arxiv_id":"2512.06380","citing_title":"Protecting Bystander Privacy via Selective Hearing in Audio LLMs","ref_index":3,"is_internal_anchor":true},{"citing_arxiv_id":"2410.17196","citing_title":"VoiceBench: Benchmarking LLM-Based Voice Assistants","ref_index":96,"is_internal_anchor":true},{"citing_arxiv_id":"2512.20136","citing_title":"M$^3$KG-RAG: Multi-hop Multimodal Knowledge Graph-enhanced Retrieval-Augmented Generation","ref_index":43,"is_internal_anchor":true},{"citing_arxiv_id":"2601.02954","citing_title":"The World is Not Mono: Enabling Spatial Understanding in Large Audio-Language Models","ref_index":20,"is_internal_anchor":true},{"citing_arxiv_id":"2505.23747","citing_title":"Spatial-MLLM: Boosting MLLM Capabilities in Visual-based Spatial Intelligence","ref_index":16,"is_internal_anchor":true},{"citing_arxiv_id":"2507.16632","citing_title":"Step-Audio 2 Technical Report","ref_index":61,"is_internal_anchor":true},{"citing_arxiv_id":"2604.23717","citing_title":"HeadRouter: Dynamic Head-Weight Routing for Task-Adaptive Audio Token Pruning in Large Audio Language Models","ref_index":21,"is_internal_anchor":true},{"citing_arxiv_id":"2604.23323","citing_title":"Robust Audio-Text Retrieval via Cross-Modal Attention and Hybrid Loss","ref_index":7,"is_internal_anchor":true},{"citing_arxiv_id":"2504.18425","citing_title":"Kimi-Audio Technical Report","ref_index":63,"is_internal_anchor":true},{"citing_arxiv_id":"2604.08703","citing_title":"QoS-QoE Translation with Large Language Model","ref_index":26,"is_internal_anchor":true},{"citing_arxiv_id":"2604.09021","citing_title":"Noise-Aware In-Context Learning for Hallucination Mitigation in ALLMs","ref_index":30,"is_internal_anchor":true},{"citing_arxiv_id":"2604.09121","citing_title":"Interactive ASR: Towards Human-Like Interaction and Semantic Coherence Evaluation for Agentic Speech Recognition","ref_index":15,"is_internal_anchor":true},{"citing_arxiv_id":"2604.13804","citing_title":"Character Beyond Speech: Leveraging Role-Playing Evaluation in Audio Large Language Models via Reinforcement Learning","ref_index":31,"is_internal_anchor":true},{"citing_arxiv_id":"2604.12527","citing_title":"Audio-Cogito: Towards Deep Audio Reasoning in Large Audio Language Models","ref_index":7,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/ZPTH6N7ZTLWW3QYKEHHVOVZTJU","json":"https://pith.science/pith/ZPTH6N7ZTLWW3QYKEHHVOVZTJU.json","graph_json":"https://pith.science/api/pith-number/ZPTH6N7ZTLWW3QYKEHHVOVZTJU/graph.json","events_json":"https://pith.science/api/pith-number/ZPTH6N7ZTLWW3QYKEHHVOVZTJU/events.json","paper":"https://pith.science/paper/ZPTH6N7Z"},"agent_actions":{"view_html":"https://pith.science/pith/ZPTH6N7ZTLWW3QYKEHHVOVZTJU","download_json":"https://pith.science/pith/ZPTH6N7ZTLWW3QYKEHHVOVZTJU.json","view_paper":"https://pith.science/paper/ZPTH6N7Z","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2310.13289&json=true","fetch_graph":"https://pith.science/api/pith-number/ZPTH6N7ZTLWW3QYKEHHVOVZTJU/graph.json","fetch_events":"https://pith.science/api/pith-number/ZPTH6N7ZTLWW3QYKEHHVOVZTJU/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/ZPTH6N7ZTLWW3QYKEHHVOVZTJU/action/timestamp_anchor","attest_storage":"https://pith.science/pith/ZPTH6N7ZTLWW3QYKEHHVOVZTJU/action/storage_attestation","attest_author":"https://pith.science/pith/ZPTH6N7ZTLWW3QYKEHHVOVZTJU/action/author_attestation","sign_citation":"https://pith.science/pith/ZPTH6N7ZTLWW3QYKEHHVOVZTJU/action/citation_signature","submit_replication":"https://pith.science/pith/ZPTH6N7ZTLWW3QYKEHHVOVZTJU/action/replication_record"}},"created_at":"2026-05-18T02:23:34.306532+00:00","updated_at":"2026-05-18T02:23:34.306532+00:00"}