{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:Y3FB54FRTYVHNZ2JYSFBWWZ5A2","short_pith_number":"pith:Y3FB54FR","schema_version":"1.0","canonical_sha256":"c6ca1ef0b19e2a76e749c48a1b5b3d06a088a3a4a9cf3ff96fe605f9664fc5b8","source":{"kind":"arxiv","id":"2605.29628","version":1},"attestation_state":"computed","paper":{"title":"COMET: Concept Space Dissection of the Modality Gap in Audio-Text Multimodal Contrastive Embeddings","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","cs.CL","cs.LG","eess.AS"],"primary_cat":"cs.SD","authors_text":"Aidong Men, Liting Gao, Wenwu Wang, Yonggang Zhu","submitted_at":"2026-05-28T09:00:44Z","abstract_excerpt":"Contrastive Language-Audio Pretraining (CLAP) models are widely used for audio understanding and support modality-agnostic condition swapping in many zero-shot applications. However, their performance is heavily affected by the modality gap between audio and text embeddings. Existing explanations mainly attribute this gap to the cone effect, treating it as a shift between mean embeddings, yet correcting the mean alone yields only limited improvements. Alternative hypotheses, such as information imbalance and dimensionality collapse, have also been proposed, but they remain insufficiently verif"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2605.29628","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.SD","submitted_at":"2026-05-28T09:00:44Z","cross_cats_sorted":["cs.AI","cs.CL","cs.LG","eess.AS"],"title_canon_sha256":"64acf0129969e3080c4de71e0f0ada9af5eba3ff42f3cfc97a5514d05be22515","abstract_canon_sha256":"214ea80155e1efa6d67bb122b1b9139f7cba38f2f9034be83610064656d568ae"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-29T01:05:51.802787Z","signature_b64":"Lavf4E1y3z6/rmixIEooGlm+nRXOujSXZuh37fUKKTYrTqoUp3j/JxA4zeX1oKAHqvFQlaSkxmG1ZmR5ia15AQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"c6ca1ef0b19e2a76e749c48a1b5b3d06a088a3a4a9cf3ff96fe605f9664fc5b8","last_reissued_at":"2026-05-29T01:05:51.802159Z","signature_status":"signed_v1","first_computed_at":"2026-05-29T01:05:51.802159Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"COMET: Concept Space Dissection of the Modality Gap in Audio-Text Multimodal Contrastive Embeddings","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","cs.CL","cs.LG","eess.AS"],"primary_cat":"cs.SD","authors_text":"Aidong Men, Liting Gao, Wenwu Wang, Yonggang Zhu","submitted_at":"2026-05-28T09:00:44Z","abstract_excerpt":"Contrastive Language-Audio Pretraining (CLAP) models are widely used for audio understanding and support modality-agnostic condition swapping in many zero-shot applications. However, their performance is heavily affected by the modality gap between audio and text embeddings. Existing explanations mainly attribute this gap to the cone effect, treating it as a shift between mean embeddings, yet correcting the mean alone yields only limited improvements. Alternative hypotheses, such as information imbalance and dimensionality collapse, have also been proposed, but they remain insufficiently verif"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.29628","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.29628/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2605.29628","created_at":"2026-05-29T01:05:51.802267+00:00"},{"alias_kind":"arxiv_version","alias_value":"2605.29628v1","created_at":"2026-05-29T01:05:51.802267+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.29628","created_at":"2026-05-29T01:05:51.802267+00:00"},{"alias_kind":"pith_short_12","alias_value":"Y3FB54FRTYVH","created_at":"2026-05-29T01:05:51.802267+00:00"},{"alias_kind":"pith_short_16","alias_value":"Y3FB54FRTYVHNZ2J","created_at":"2026-05-29T01:05:51.802267+00:00"},{"alias_kind":"pith_short_8","alias_value":"Y3FB54FR","created_at":"2026-05-29T01:05:51.802267+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/Y3FB54FRTYVHNZ2JYSFBWWZ5A2","json":"https://pith.science/pith/Y3FB54FRTYVHNZ2JYSFBWWZ5A2.json","graph_json":"https://pith.science/api/pith-number/Y3FB54FRTYVHNZ2JYSFBWWZ5A2/graph.json","events_json":"https://pith.science/api/pith-number/Y3FB54FRTYVHNZ2JYSFBWWZ5A2/events.json","paper":"https://pith.science/paper/Y3FB54FR"},"agent_actions":{"view_html":"https://pith.science/pith/Y3FB54FRTYVHNZ2JYSFBWWZ5A2","download_json":"https://pith.science/pith/Y3FB54FRTYVHNZ2JYSFBWWZ5A2.json","view_paper":"https://pith.science/paper/Y3FB54FR","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2605.29628&json=true","fetch_graph":"https://pith.science/api/pith-number/Y3FB54FRTYVHNZ2JYSFBWWZ5A2/graph.json","fetch_events":"https://pith.science/api/pith-number/Y3FB54FRTYVHNZ2JYSFBWWZ5A2/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/Y3FB54FRTYVHNZ2JYSFBWWZ5A2/action/timestamp_anchor","attest_storage":"https://pith.science/pith/Y3FB54FRTYVHNZ2JYSFBWWZ5A2/action/storage_attestation","attest_author":"https://pith.science/pith/Y3FB54FRTYVHNZ2JYSFBWWZ5A2/action/author_attestation","sign_citation":"https://pith.science/pith/Y3FB54FRTYVHNZ2JYSFBWWZ5A2/action/citation_signature","submit_replication":"https://pith.science/pith/Y3FB54FRTYVHNZ2JYSFBWWZ5A2/action/replication_record"}},"created_at":"2026-05-29T01:05:51.802267+00:00","updated_at":"2026-05-29T01:05:51.802267+00:00"}