{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:GMH4UAMFDXWDWTBEAYXTDM7KOR","short_pith_number":"pith:GMH4UAMF","canonical_record":{"source":{"id":"2605.06582","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-07T17:11:22Z","cross_cats_sorted":["cs.CL","cs.SD"],"title_canon_sha256":"59807b35e32d4a73ebd341180dead6b1f03cafad47f37df62a6354572ca93743","abstract_canon_sha256":"7b681725430bf02f2e528276b1172415e2a4c1fc77a25c71711f1db6a0af57e6"},"schema_version":"1.0"},"canonical_sha256":"330fca01851dec3b4c24062f31b3ea745f2547c7a9d3c564477ee7730ed16628","source":{"kind":"arxiv","id":"2605.06582","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.06582","created_at":"2026-06-09T02:07:28Z"},{"alias_kind":"arxiv_version","alias_value":"2605.06582v2","created_at":"2026-06-09T02:07:28Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.06582","created_at":"2026-06-09T02:07:28Z"},{"alias_kind":"pith_short_12","alias_value":"GMH4UAMFDXWD","created_at":"2026-06-09T02:07:28Z"},{"alias_kind":"pith_short_16","alias_value":"GMH4UAMFDXWDWTBE","created_at":"2026-06-09T02:07:28Z"},{"alias_kind":"pith_short_8","alias_value":"GMH4UAMF","created_at":"2026-06-09T02:07:28Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:GMH4UAMFDXWDWTBEAYXTDM7KOR","target":"record","payload":{"canonical_record":{"source":{"id":"2605.06582","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-07T17:11:22Z","cross_cats_sorted":["cs.CL","cs.SD"],"title_canon_sha256":"59807b35e32d4a73ebd341180dead6b1f03cafad47f37df62a6354572ca93743","abstract_canon_sha256":"7b681725430bf02f2e528276b1172415e2a4c1fc77a25c71711f1db6a0af57e6"},"schema_version":"1.0"},"canonical_sha256":"330fca01851dec3b4c24062f31b3ea745f2547c7a9d3c564477ee7730ed16628","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-09T02:07:28.589649Z","signature_b64":"PutzwJefed65KY7VS4un6zRuo06cPZjKpKen/0sxW6rx+FGwYWQUvBAjPvbmW/ei4qUVe+OkgL7jIsCirmXQAA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"330fca01851dec3b4c24062f31b3ea745f2547c7a9d3c564477ee7730ed16628","last_reissued_at":"2026-06-09T02:07:28.588832Z","signature_status":"signed_v1","first_computed_at":"2026-06-09T02:07:28.588832Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2605.06582","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-06-09T02:07:28Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"0qBNaE7ePd9pTewAG0P6UySX8iz1xqNXieG4z0XuKFtD5nc01jjY9XdHBduBAbSer3L4/ci7+OwRUrisb9bhBA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-10T15:03:36.899756Z"},"content_sha256":"a894e08f9374646ca6a3f115edf10387d7ee371a23e85f8baf64771ea278e285","schema_version":"1.0","event_id":"sha256:a894e08f9374646ca6a3f115edf10387d7ee371a23e85f8baf64771ea278e285"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:GMH4UAMFDXWDWTBEAYXTDM7KOR","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"PairAlign: A Framework for Sequence Tokenization via Self-Alignment with Applications to Audio Tokenization","license":"http://creativecommons.org/licenses/by/4.0/","headline":"PairAlign generates compact audio token sequences by training each view's output to be likely under the other's encoder while contrasting unrelated examples.","cross_cats":["cs.CL","cs.SD"],"primary_cat":"cs.LG","authors_text":"Adhiraj Banerjee, Vipul Arora","submitted_at":"2026-05-07T17:11:22Z","abstract_excerpt":"Many operations on sensory data -- comparison, memory, retrieval, and reasoning -- are naturally expressed over discrete symbolic structures. In language this interface is given by tokens; in audio, it must be learned. Existing audio tokenizers rely on quantization, clustering, or codec reconstruction, assigning tokens locally, so sequence consistency, compactness, length control, termination, and edit similarity are rarely optimized directly.\n  We introduce PairAlign, a framework for compact audio tokenization through sequence-level self-alignment. PairAlign treats tokenization as conditional"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"On TIMIT retrieval, it preserves edit-distance search while reducing archive token count by 55%. A continuous-sweep probe shows lower local overlap than a dense geometric tokenizer, but stronger length control and bounded edit trajectories under 100 ms shifts.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That optimizing cross-view sequence likelihood with unrelated negatives as contrast produces token sequences whose edit-distance properties generalize to downstream tasks without direct supervision on those properties.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"PairAlign learns compact audio token sequences via self-alignment of paired content views using an autoregressive decoder, achieving strong cross-view consistency and edit-distance preservation while reducing token count by 55% on TIMIT.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"PairAlign generates compact audio token sequences by training each view's output to be likely under the other's encoder while contrasting unrelated examples.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"8102ba5b0f9d90d72fe37e9fc8078dfd3661a9535097e6aa626d7a2e54714bc7"},"source":{"id":"2605.06582","kind":"arxiv","version":2},"verdict":{"id":"c9075e1a-d981-4fad-b957-6f1d93c6cb24","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-08T12:22:30.168198Z","strongest_claim":"On TIMIT retrieval, it preserves edit-distance search while reducing archive token count by 55%. A continuous-sweep probe shows lower local overlap than a dense geometric tokenizer, but stronger length control and bounded edit trajectories under 100 ms shifts.","one_line_summary":"PairAlign learns compact audio token sequences via self-alignment of paired content views using an autoregressive decoder, achieving strong cross-view consistency and edit-distance preservation while reducing token count by 55% on TIMIT.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That optimizing cross-view sequence likelihood with unrelated negatives as contrast produces token sequences whose edit-distance properties generalize to downstream tasks without direct supervision on those properties.","pith_extraction_headline":"PairAlign generates compact audio token sequences by training each view's output to be likely under the other's encoder while contrasting unrelated examples."},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.06582/integrity.json","findings":[],"available":true,"detectors_run":[{"name":"claim_evidence","ran_at":"2026-05-20T12:22:03.804268Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"ai_meta_artifact","ran_at":"2026-05-20T07:38:18.676923Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"doi_title_agreement","ran_at":"2026-05-19T18:01:19.445017Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"doi_compliance","ran_at":"2026-05-19T12:34:45.422625Z","status":"completed","version":"1.0.0","findings_count":0}],"snapshot_sha256":"a02e5e476a74025fb4d82f26db601686d952f22bd78183d571b9dbd94bb3f4ec"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"c9075e1a-d981-4fad-b957-6f1d93c6cb24"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-06-09T02:07:28Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"RtEdta4jf36gO/jCllsGuvLzw8IY3Dciqu0uLlkfOI+Egf1qjr6b+bWaneOHp/7C1M2ZSO8wDUQcAc71Az7lAg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-10T15:03:36.900862Z"},"content_sha256":"e37735b9f553cfe70abb5403fcc3e121b3163beb0ecba8fdee78c7c2271f54e2","schema_version":"1.0","event_id":"sha256:e37735b9f553cfe70abb5403fcc3e121b3163beb0ecba8fdee78c7c2271f54e2"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/GMH4UAMFDXWDWTBEAYXTDM7KOR/bundle.json","state_url":"https://pith.science/pith/GMH4UAMFDXWDWTBEAYXTDM7KOR/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/GMH4UAMFDXWDWTBEAYXTDM7KOR/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-10T15:03:36Z","links":{"resolver":"https://pith.science/pith/GMH4UAMFDXWDWTBEAYXTDM7KOR","bundle":"https://pith.science/pith/GMH4UAMFDXWDWTBEAYXTDM7KOR/bundle.json","state":"https://pith.science/pith/GMH4UAMFDXWDWTBEAYXTDM7KOR/state.json","well_known_bundle":"https://pith.science/.well-known/pith/GMH4UAMFDXWDWTBEAYXTDM7KOR/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:GMH4UAMFDXWDWTBEAYXTDM7KOR","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"7b681725430bf02f2e528276b1172415e2a4c1fc77a25c71711f1db6a0af57e6","cross_cats_sorted":["cs.CL","cs.SD"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-07T17:11:22Z","title_canon_sha256":"59807b35e32d4a73ebd341180dead6b1f03cafad47f37df62a6354572ca93743"},"schema_version":"1.0","source":{"id":"2605.06582","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.06582","created_at":"2026-06-09T02:07:28Z"},{"alias_kind":"arxiv_version","alias_value":"2605.06582v2","created_at":"2026-06-09T02:07:28Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.06582","created_at":"2026-06-09T02:07:28Z"},{"alias_kind":"pith_short_12","alias_value":"GMH4UAMFDXWD","created_at":"2026-06-09T02:07:28Z"},{"alias_kind":"pith_short_16","alias_value":"GMH4UAMFDXWDWTBE","created_at":"2026-06-09T02:07:28Z"},{"alias_kind":"pith_short_8","alias_value":"GMH4UAMF","created_at":"2026-06-09T02:07:28Z"}],"graph_snapshots":[{"event_id":"sha256:e37735b9f553cfe70abb5403fcc3e121b3163beb0ecba8fdee78c7c2271f54e2","target":"graph","created_at":"2026-06-09T02:07:28Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"On TIMIT retrieval, it preserves edit-distance search while reducing archive token count by 55%. A continuous-sweep probe shows lower local overlap than a dense geometric tokenizer, but stronger length control and bounded edit trajectories under 100 ms shifts."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That optimizing cross-view sequence likelihood with unrelated negatives as contrast produces token sequences whose edit-distance properties generalize to downstream tasks without direct supervision on those properties."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"PairAlign learns compact audio token sequences via self-alignment of paired content views using an autoregressive decoder, achieving strong cross-view consistency and edit-distance preservation while reducing token count by 55% on TIMIT."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"PairAlign generates compact audio token sequences by training each view's output to be likely under the other's encoder while contrasting unrelated examples."}],"snapshot_sha256":"8102ba5b0f9d90d72fe37e9fc8078dfd3661a9535097e6aa626d7a2e54714bc7"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[{"findings_count":0,"name":"claim_evidence","ran_at":"2026-05-20T12:22:03.804268Z","status":"completed","version":"1.0.0"},{"findings_count":0,"name":"ai_meta_artifact","ran_at":"2026-05-20T07:38:18.676923Z","status":"completed","version":"1.0.0"},{"findings_count":0,"name":"doi_title_agreement","ran_at":"2026-05-19T18:01:19.445017Z","status":"completed","version":"1.0.0"},{"findings_count":0,"name":"doi_compliance","ran_at":"2026-05-19T12:34:45.422625Z","status":"completed","version":"1.0.0"}],"endpoint":"/pith/2605.06582/integrity.json","findings":[],"snapshot_sha256":"a02e5e476a74025fb4d82f26db601686d952f22bd78183d571b9dbd94bb3f4ec","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Many operations on sensory data -- comparison, memory, retrieval, and reasoning -- are naturally expressed over discrete symbolic structures. In language this interface is given by tokens; in audio, it must be learned. Existing audio tokenizers rely on quantization, clustering, or codec reconstruction, assigning tokens locally, so sequence consistency, compactness, length control, termination, and edit similarity are rarely optimized directly.\n  We introduce PairAlign, a framework for compact audio tokenization through sequence-level self-alignment. PairAlign treats tokenization as conditional","authors_text":"Adhiraj Banerjee, Vipul Arora","cross_cats":["cs.CL","cs.SD"],"headline":"PairAlign generates compact audio token sequences by training each view's output to be likely under the other's encoder while contrasting unrelated examples.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-07T17:11:22Z","title":"PairAlign: A Framework for Sequence Tokenization via Self-Alignment with Applications to Audio Tokenization"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.06582","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-08T12:22:30.168198Z","id":"c9075e1a-d981-4fad-b957-6f1d93c6cb24","model_set":{"reader":"grok-4.3"},"one_line_summary":"PairAlign learns compact audio token sequences via self-alignment of paired content views using an autoregressive decoder, achieving strong cross-view consistency and edit-distance preservation while reducing token count by 55% on TIMIT.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"PairAlign generates compact audio token sequences by training each view's output to be likely under the other's encoder while contrasting unrelated examples.","strongest_claim":"On TIMIT retrieval, it preserves edit-distance search while reducing archive token count by 55%. A continuous-sweep probe shows lower local overlap than a dense geometric tokenizer, but stronger length control and bounded edit trajectories under 100 ms shifts.","weakest_assumption":"That optimizing cross-view sequence likelihood with unrelated negatives as contrast produces token sequences whose edit-distance properties generalize to downstream tasks without direct supervision on those properties."}},"verdict_id":"c9075e1a-d981-4fad-b957-6f1d93c6cb24"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:a894e08f9374646ca6a3f115edf10387d7ee371a23e85f8baf64771ea278e285","target":"record","created_at":"2026-06-09T02:07:28Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"7b681725430bf02f2e528276b1172415e2a4c1fc77a25c71711f1db6a0af57e6","cross_cats_sorted":["cs.CL","cs.SD"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-07T17:11:22Z","title_canon_sha256":"59807b35e32d4a73ebd341180dead6b1f03cafad47f37df62a6354572ca93743"},"schema_version":"1.0","source":{"id":"2605.06582","kind":"arxiv","version":2}},"canonical_sha256":"330fca01851dec3b4c24062f31b3ea745f2547c7a9d3c564477ee7730ed16628","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"330fca01851dec3b4c24062f31b3ea745f2547c7a9d3c564477ee7730ed16628","first_computed_at":"2026-06-09T02:07:28.588832Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-06-09T02:07:28.588832Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"PutzwJefed65KY7VS4un6zRuo06cPZjKpKen/0sxW6rx+FGwYWQUvBAjPvbmW/ei4qUVe+OkgL7jIsCirmXQAA==","signature_status":"signed_v1","signed_at":"2026-06-09T02:07:28.589649Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.06582","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:a894e08f9374646ca6a3f115edf10387d7ee371a23e85f8baf64771ea278e285","sha256:e37735b9f553cfe70abb5403fcc3e121b3163beb0ecba8fdee78c7c2271f54e2"],"state_sha256":"b0833cddcae7ce613269927c329b257346aa33ad656ed6020e34d7aaaab1d3dd"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"N7ZjgbVLSEKJ8T3zLTkJJluD0aRCMtOR3gEUhF3ohGRWyW1x720slG0swjfGg6FYo41zlOiFDCxaFmZ4m53RAg==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-10T15:03:36.905606Z","bundle_sha256":"de70ec5d08a589198481c517a56340d0d4c1e27cb470436a4fa9e58d9bbc2b87"}}