{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:H7GNWR6IUNNDWZ7CVXQWCOQDGU","short_pith_number":"pith:H7GNWR6I","canonical_record":{"source":{"id":"2605.14231","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-14T00:56:51Z","cross_cats_sorted":["cs.AI","cs.SD"],"title_canon_sha256":"3ae7727a22cea6c274585ddfc69c7bbd364b3acea5d0e33cc5667c9818769418","abstract_canon_sha256":"699bfbdd4feb6fbff3f679abbf9a9c8dc2e7ce1c67273572659cb439e106737c"},"schema_version":"1.0"},"canonical_sha256":"3fccdb47c8a35a3b67e2ade1613a0335300ba6142a9d4016092f3ec498c7a0a5","source":{"kind":"arxiv","id":"2605.14231","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.14231","created_at":"2026-05-17T23:39:10Z"},{"alias_kind":"arxiv_version","alias_value":"2605.14231v1","created_at":"2026-05-17T23:39:10Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.14231","created_at":"2026-05-17T23:39:10Z"},{"alias_kind":"pith_short_12","alias_value":"H7GNWR6IUNND","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"H7GNWR6IUNNDWZ7C","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"H7GNWR6I","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:H7GNWR6IUNNDWZ7CVXQWCOQDGU","target":"record","payload":{"canonical_record":{"source":{"id":"2605.14231","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-14T00:56:51Z","cross_cats_sorted":["cs.AI","cs.SD"],"title_canon_sha256":"3ae7727a22cea6c274585ddfc69c7bbd364b3acea5d0e33cc5667c9818769418","abstract_canon_sha256":"699bfbdd4feb6fbff3f679abbf9a9c8dc2e7ce1c67273572659cb439e106737c"},"schema_version":"1.0"},"canonical_sha256":"3fccdb47c8a35a3b67e2ade1613a0335300ba6142a9d4016092f3ec498c7a0a5","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:39:10.737738Z","signature_b64":"5Llt+A8Bok1/s3TaesbxasZHE7MSi/iX9w9LZ1DzvncFg1Ya+Tt70wbmRj6hMgkqr/U7tol1go6qQfesUh3DAg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"3fccdb47c8a35a3b67e2ade1613a0335300ba6142a9d4016092f3ec498c7a0a5","last_reissued_at":"2026-05-17T23:39:10.737309Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:39:10.737309Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2605.14231","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:39:10Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"3zmepvhFnOPt/zvCrWTx32E1tvMN7tFQ5lcMoZjAq983BNmwcq6vhcYN7NgtqIsGpvOiCGqFSrHROL6x3NQaCQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-05T20:12:55.693588Z"},"content_sha256":"e0ab2e7d9f4654b9e895a6e5cf872f178c418aced62c82264477d902b61c0db3","schema_version":"1.0","event_id":"sha256:e0ab2e7d9f4654b9e895a6e5cf872f178c418aced62c82264477d902b61c0db3"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:H7GNWR6IUNNDWZ7CVXQWCOQDGU","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"AudioMosaic: Contrastive Masked Audio Representation Learning","license":"http://creativecommons.org/licenses/by/4.0/","headline":"AudioMosaic learns stronger audio representations by contrasting structured masked spectrogram patches rather than reconstructing them.","cross_cats":["cs.AI","cs.SD"],"primary_cat":"cs.LG","authors_text":"Christopher Leckie, Cihang Xie, Hanxun Huang, Qizhou Wang, Sarah Erfani, Xingjun Ma","submitted_at":"2026-05-14T00:56:51Z","abstract_excerpt":"Audio self-supervised learning (SSL) aims to learn general-purpose representations from large-scale unlabeled audio data. While recent advances have been driven mainly by generative reconstruction objectives, contrastive approaches remain less explored, partly due to the difficulty of designing effective audio augmentations and the large batch sizes required for contrastive pre-training. We introduce \\textbf{AudioMosaic}, a contrastive learning-based audio encoder for general audio understanding. During pre-training, AudioMosaic constructs positive pairs by applying structured time-frequency m"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"AudioMosaic achieves state-of-the-art performance on several standard audio benchmarks under both linear probing and fine-tuning. We further show that integrating the pretrained AudioMosaic encoder into audio-language models improves performance on audio-language tasks.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That structured time-frequency masking on spectrogram patches produces positive pairs effective enough for contrastive learning to outperform generative approaches without requiring complex audio-specific augmentations or impractically large batches.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"AudioMosaic learns general-purpose audio representations through contrastive pre-training with structured spectrogram masking, reaching state-of-the-art results on standard benchmarks and improving audio-language tasks.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"AudioMosaic learns stronger audio representations by contrasting structured masked spectrogram patches rather than reconstructing them.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"212fa80af94e89b09dc062a854c177c18ff49abb55db1ebb6fef6770df8c285a"},"source":{"id":"2605.14231","kind":"arxiv","version":1},"verdict":{"id":"3df96e28-878b-4513-a9e3-6a52a79d3de3","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T01:48:54.282820Z","strongest_claim":"AudioMosaic achieves state-of-the-art performance on several standard audio benchmarks under both linear probing and fine-tuning. We further show that integrating the pretrained AudioMosaic encoder into audio-language models improves performance on audio-language tasks.","one_line_summary":"AudioMosaic learns general-purpose audio representations through contrastive pre-training with structured spectrogram masking, reaching state-of-the-art results on standard benchmarks and improving audio-language tasks.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That structured time-frequency masking on spectrogram patches produces positive pairs effective enough for contrastive learning to outperform generative approaches without requiring complex audio-specific augmentations or impractically large batches.","pith_extraction_headline":"AudioMosaic learns stronger audio representations by contrasting structured masked spectrogram patches rather than reconstructing them."},"references":{"count":18,"sample":[{"doi":"","year":null,"title":"Optimizing audio augmentations for contrastive learn- ing of health-related acoustic signals.arXiv preprint arXiv:2309.05843,","work_id":"a3358304-8e36-4f08-a2d9-5da59630762a","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"A-jepa: Joint-embedding predictive architecture can listen.arXiv preprint arXiv:2311.15830","work_id":"cf2ef9fd-18d7-410e-843d-32334e6ae7a9","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Audio Flamingo 3: Advancing Audio Intelligence with Fully Open Large Audio Language Models","work_id":"67c2892d-8e27-4da1-8198-71c48e673e96","ref_index":3,"cited_arxiv_id":"2507.08128","is_internal_anchor":true},{"doi":"","year":2021,"title":"Ast: Audio spectro- gram transformer","work_id":"63149a19-14f4-4ed6-b5c9-f4ecbdae3608","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2025,"title":"Sheet: A multi- purpose open-source speech human evaluation estimation toolkit","work_id":"6968eb5b-a178-41d4-8914-706bb21c13ca","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":18,"snapshot_sha256":"498f7b17521fa238144eeb1d53c97130bf9f4bee3885602a0090d7329d4cefb4","internal_anchors":4},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"3df96e28-878b-4513-a9e3-6a52a79d3de3"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:39:10Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"hPBmYn+6MqFmzwgmah4crG7jp1RVFkdG8W7Gd6qSc4GFNkNODb6jx0xmPjGFknmQgp2hcXTJCdUIcWbDx8d5Cw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-05T20:12:55.696419Z"},"content_sha256":"c91b9e5f149c54ce37f3ffae7a96db9cc04c5cd0d2880c14c9c0131013489723","schema_version":"1.0","event_id":"sha256:c91b9e5f149c54ce37f3ffae7a96db9cc04c5cd0d2880c14c9c0131013489723"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/H7GNWR6IUNNDWZ7CVXQWCOQDGU/bundle.json","state_url":"https://pith.science/pith/H7GNWR6IUNNDWZ7CVXQWCOQDGU/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/H7GNWR6IUNNDWZ7CVXQWCOQDGU/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-05T20:12:55Z","links":{"resolver":"https://pith.science/pith/H7GNWR6IUNNDWZ7CVXQWCOQDGU","bundle":"https://pith.science/pith/H7GNWR6IUNNDWZ7CVXQWCOQDGU/bundle.json","state":"https://pith.science/pith/H7GNWR6IUNNDWZ7CVXQWCOQDGU/state.json","well_known_bundle":"https://pith.science/.well-known/pith/H7GNWR6IUNNDWZ7CVXQWCOQDGU/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:H7GNWR6IUNNDWZ7CVXQWCOQDGU","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"699bfbdd4feb6fbff3f679abbf9a9c8dc2e7ce1c67273572659cb439e106737c","cross_cats_sorted":["cs.AI","cs.SD"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-14T00:56:51Z","title_canon_sha256":"3ae7727a22cea6c274585ddfc69c7bbd364b3acea5d0e33cc5667c9818769418"},"schema_version":"1.0","source":{"id":"2605.14231","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.14231","created_at":"2026-05-17T23:39:10Z"},{"alias_kind":"arxiv_version","alias_value":"2605.14231v1","created_at":"2026-05-17T23:39:10Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.14231","created_at":"2026-05-17T23:39:10Z"},{"alias_kind":"pith_short_12","alias_value":"H7GNWR6IUNND","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"H7GNWR6IUNNDWZ7C","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"H7GNWR6I","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:c91b9e5f149c54ce37f3ffae7a96db9cc04c5cd0d2880c14c9c0131013489723","target":"graph","created_at":"2026-05-17T23:39:10Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"AudioMosaic achieves state-of-the-art performance on several standard audio benchmarks under both linear probing and fine-tuning. We further show that integrating the pretrained AudioMosaic encoder into audio-language models improves performance on audio-language tasks."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That structured time-frequency masking on spectrogram patches produces positive pairs effective enough for contrastive learning to outperform generative approaches without requiring complex audio-specific augmentations or impractically large batches."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"AudioMosaic learns general-purpose audio representations through contrastive pre-training with structured spectrogram masking, reaching state-of-the-art results on standard benchmarks and improving audio-language tasks."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"AudioMosaic learns stronger audio representations by contrasting structured masked spectrogram patches rather than reconstructing them."}],"snapshot_sha256":"212fa80af94e89b09dc062a854c177c18ff49abb55db1ebb6fef6770df8c285a"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"Audio self-supervised learning (SSL) aims to learn general-purpose representations from large-scale unlabeled audio data. While recent advances have been driven mainly by generative reconstruction objectives, contrastive approaches remain less explored, partly due to the difficulty of designing effective audio augmentations and the large batch sizes required for contrastive pre-training. We introduce \\textbf{AudioMosaic}, a contrastive learning-based audio encoder for general audio understanding. During pre-training, AudioMosaic constructs positive pairs by applying structured time-frequency m","authors_text":"Christopher Leckie, Cihang Xie, Hanxun Huang, Qizhou Wang, Sarah Erfani, Xingjun Ma","cross_cats":["cs.AI","cs.SD"],"headline":"AudioMosaic learns stronger audio representations by contrasting structured masked spectrogram patches rather than reconstructing them.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-14T00:56:51Z","title":"AudioMosaic: Contrastive Masked Audio Representation Learning"},"references":{"count":18,"internal_anchors":4,"resolved_work":18,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Optimizing audio augmentations for contrastive learn- ing of health-related acoustic signals.arXiv preprint arXiv:2309.05843,","work_id":"a3358304-8e36-4f08-a2d9-5da59630762a","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"A-jepa: Joint-embedding predictive architecture can listen.arXiv preprint arXiv:2311.15830","work_id":"cf2ef9fd-18d7-410e-843d-32334e6ae7a9","year":null},{"cited_arxiv_id":"2507.08128","doi":"","is_internal_anchor":true,"ref_index":3,"title":"Audio Flamingo 3: Advancing Audio Intelligence with Fully Open Large Audio Language Models","work_id":"67c2892d-8e27-4da1-8198-71c48e673e96","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Ast: Audio spectro- gram transformer","work_id":"63149a19-14f4-4ed6-b5c9-f4ecbdae3608","year":2021},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Sheet: A multi- purpose open-source speech human evaluation estimation toolkit","work_id":"6968eb5b-a178-41d4-8914-706bb21c13ca","year":2025}],"snapshot_sha256":"498f7b17521fa238144eeb1d53c97130bf9f4bee3885602a0090d7329d4cefb4"},"source":{"id":"2605.14231","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-15T01:48:54.282820Z","id":"3df96e28-878b-4513-a9e3-6a52a79d3de3","model_set":{"reader":"grok-4.3"},"one_line_summary":"AudioMosaic learns general-purpose audio representations through contrastive pre-training with structured spectrogram masking, reaching state-of-the-art results on standard benchmarks and improving audio-language tasks.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"AudioMosaic learns stronger audio representations by contrasting structured masked spectrogram patches rather than reconstructing them.","strongest_claim":"AudioMosaic achieves state-of-the-art performance on several standard audio benchmarks under both linear probing and fine-tuning. We further show that integrating the pretrained AudioMosaic encoder into audio-language models improves performance on audio-language tasks.","weakest_assumption":"That structured time-frequency masking on spectrogram patches produces positive pairs effective enough for contrastive learning to outperform generative approaches without requiring complex audio-specific augmentations or impractically large batches."}},"verdict_id":"3df96e28-878b-4513-a9e3-6a52a79d3de3"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:e0ab2e7d9f4654b9e895a6e5cf872f178c418aced62c82264477d902b61c0db3","target":"record","created_at":"2026-05-17T23:39:10Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"699bfbdd4feb6fbff3f679abbf9a9c8dc2e7ce1c67273572659cb439e106737c","cross_cats_sorted":["cs.AI","cs.SD"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-14T00:56:51Z","title_canon_sha256":"3ae7727a22cea6c274585ddfc69c7bbd364b3acea5d0e33cc5667c9818769418"},"schema_version":"1.0","source":{"id":"2605.14231","kind":"arxiv","version":1}},"canonical_sha256":"3fccdb47c8a35a3b67e2ade1613a0335300ba6142a9d4016092f3ec498c7a0a5","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"3fccdb47c8a35a3b67e2ade1613a0335300ba6142a9d4016092f3ec498c7a0a5","first_computed_at":"2026-05-17T23:39:10.737309Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:39:10.737309Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"5Llt+A8Bok1/s3TaesbxasZHE7MSi/iX9w9LZ1DzvncFg1Ya+Tt70wbmRj6hMgkqr/U7tol1go6qQfesUh3DAg==","signature_status":"signed_v1","signed_at":"2026-05-17T23:39:10.737738Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.14231","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:e0ab2e7d9f4654b9e895a6e5cf872f178c418aced62c82264477d902b61c0db3","sha256:c91b9e5f149c54ce37f3ffae7a96db9cc04c5cd0d2880c14c9c0131013489723"],"state_sha256":"63335b3bd342aca6bd0a75ea766006a9d919fddd41c2bf5956482dd10ba7d151"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"z4kVgqiGkNUNDACGZFmhGSaNrBdzu4G+xB3OFAyGmhkoEbl0cYppT+S/jbKuwX1K9nq88lt+ewhAPR60+qDuBg==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-05T20:12:55.701375Z","bundle_sha256":"aa5d23ec4ea949e642660659eb06292759f18c7d1501ddec0b7320f656c042d1"}}