{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:EIQJJ4XWI4KAFI3M6JBEF5M44Z","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"5a796fd442792dd585c6372960d188aea1fb81a5273d71638182b88600ea5c08","cross_cats_sorted":["cs.AI"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.SD","submitted_at":"2026-05-15T10:35:49Z","title_canon_sha256":"ec5f07fdea99f6f7f8c5ec81374e48a37f6d08a0f354945307c1c131a428c9f1"},"schema_version":"1.0","source":{"id":"2605.15831","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.15831","created_at":"2026-05-20T00:01:20Z"},{"alias_kind":"arxiv_version","alias_value":"2605.15831v1","created_at":"2026-05-20T00:01:20Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.15831","created_at":"2026-05-20T00:01:20Z"},{"alias_kind":"pith_short_12","alias_value":"EIQJJ4XWI4KA","created_at":"2026-05-20T00:01:20Z"},{"alias_kind":"pith_short_16","alias_value":"EIQJJ4XWI4KAFI3M","created_at":"2026-05-20T00:01:20Z"},{"alias_kind":"pith_short_8","alias_value":"EIQJJ4XW","created_at":"2026-05-20T00:01:20Z"}],"graph_snapshots":[{"event_id":"sha256:df0e697b9a06f0b6a8d31462e54c8e26ce8010ca123f713d30945617d1d8eb09","target":"graph","created_at":"2026-05-20T00:01:20Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"BandTok yields a physically interpretable time-frequency token grid with a more independent token structure, making it better suited for autoregressive modeling than residual-codebook tokenizers."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"The residual hierarchy in existing high-fidelity codecs imposes strong sequential dependencies that amplify error accumulation during autoregressive generation after sequence flattening; the single shared codebook in BandTok avoids this while preserving reconstruction quality."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"BandTok tokenizes Mel-spectrograms as independent time-frequency band tokens from a single codebook and pairs it with 2D RoPE in an autoregressive model to improve music generation over residual multi-codebook tokenizers."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"BandTok turns music into a 2D time-frequency token grid from a single shared codebook, reducing sequential dependencies for autoregressive generation."}],"snapshot_sha256":"5da5a4f6b79c9ba79d73cdff9a2787e65b0287d19813e292b3b28f4583992545"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[{"findings_count":0,"name":"doi_title_agreement","ran_at":"2026-05-19T19:01:19.007650Z","status":"completed","version":"1.0.0"},{"findings_count":0,"name":"doi_compliance","ran_at":"2026-05-19T18:52:03.665460Z","status":"completed","version":"1.0.0"},{"findings_count":0,"name":"ai_meta_artifact","ran_at":"2026-05-19T17:33:48.719292Z","status":"skipped","version":"1.0.0"},{"findings_count":0,"name":"claim_evidence","ran_at":"2026-05-19T17:21:55.858488Z","status":"completed","version":"1.0.0"}],"endpoint":"/pith/2605.15831/integrity.json","findings":[],"snapshot_sha256":"ea0e2a49401d112f15464e7410639a4886e168baf4e0a64a51ea9640f4b337db","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Autoregressive music generation depends strongly on the audio tokenizer. Existing high-fidelity codecs often use residual multi-codebook quantization, which preserves reconstruction quality but complicates language modeling after sequence flattening, as the residual hierarchy imposes strong sequential dependencies and can amplify error accumulation. We propose BandTok, a generation-oriented 2D Mel-spectrogram tokenizer that represents each frame with Mel-frequency band tokens from a single shared codebook. This design yields a physically interpretable time-frequency token grid with a more inde","authors_text":"Guochen Yu, Xiaotao Gu, Xingyu Ma, Yuqing Cheng","cross_cats":["cs.AI"],"headline":"BandTok turns music into a 2D time-frequency token grid from a single shared codebook, reducing sequential dependencies for autoregressive generation.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.SD","submitted_at":"2026-05-15T10:35:49Z","title":"Modeling Music as a Time-Frequency Image: A 2D Tokenizer for Music Generation"},"references":{"count":33,"internal_anchors":6,"resolved_work":33,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Soundstream: An end-to-end neural audio codec,","work_id":"9fbb792b-e036-44f0-b2e7-5f5b614d0f9d","year":2021},{"cited_arxiv_id":"2210.13438","doi":"","is_internal_anchor":true,"ref_index":2,"title":"High Fidelity Neural Audio Compression","work_id":"bc645d2d-e9f2-4cb8-9a6d-bd557bc7a258","year":2022},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"High-fidelity audio compression with improved rvqgan,","work_id":"19f9e2a7-acb1-4a5b-b00a-04e3d9f80a41","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Audiolm: a language modeling approach to audio generation,","work_id":"bd60205e-fea4-4469-841c-44cf3c04ac71","year":2023},{"cited_arxiv_id":"2301.11325","doi":"","is_internal_anchor":true,"ref_index":5,"title":"MusicLM: Generating Music From Text","work_id":"15e6566e-1c36-468f-966e-823248cbf87f","year":2023}],"snapshot_sha256":"dc66c0399d6ca420ef5144fce57dfb62ab236176c887334318f4249a1b90521f"},"source":{"id":"2605.15831","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-19T18:42:43.376094Z","id":"55b822d9-f159-452d-bb0a-5bedd8f9de6d","model_set":{"reader":"grok-4.3"},"one_line_summary":"BandTok tokenizes Mel-spectrograms as independent time-frequency band tokens from a single codebook and pairs it with 2D RoPE in an autoregressive model to improve music generation over residual multi-codebook tokenizers.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"BandTok turns music into a 2D time-frequency token grid from a single shared codebook, reducing sequential dependencies for autoregressive generation.","strongest_claim":"BandTok yields a physically interpretable time-frequency token grid with a more independent token structure, making it better suited for autoregressive modeling than residual-codebook tokenizers.","weakest_assumption":"The residual hierarchy in existing high-fidelity codecs imposes strong sequential dependencies that amplify error accumulation during autoregressive generation after sequence flattening; the single shared codebook in BandTok avoids this while preserving reconstruction quality."}},"verdict_id":"55b822d9-f159-452d-bb0a-5bedd8f9de6d"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:dd891f2010f53ee4a2e53683fc2d05cb9e6333bd32767dfe385eba628016eedd","target":"record","created_at":"2026-05-20T00:01:20Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"5a796fd442792dd585c6372960d188aea1fb81a5273d71638182b88600ea5c08","cross_cats_sorted":["cs.AI"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.SD","submitted_at":"2026-05-15T10:35:49Z","title_canon_sha256":"ec5f07fdea99f6f7f8c5ec81374e48a37f6d08a0f354945307c1c131a428c9f1"},"schema_version":"1.0","source":{"id":"2605.15831","kind":"arxiv","version":1}},"canonical_sha256":"222094f2f6471402a36cf24242f59ce6789cb804c2f7c74151660059a3ecc68c","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"222094f2f6471402a36cf24242f59ce6789cb804c2f7c74151660059a3ecc68c","first_computed_at":"2026-05-20T00:01:20.748003Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-20T00:01:20.748003Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"LwaMq+V4sSzRazWP6/fL2ts8/YjnYvmdSeOQuE7qghMjwK5pfyd0eHhNQasLLBz6X6Du0cFJYG8Buvbs8fYwAQ==","signature_status":"signed_v1","signed_at":"2026-05-20T00:01:20.748743Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.15831","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:dd891f2010f53ee4a2e53683fc2d05cb9e6333bd32767dfe385eba628016eedd","sha256:df0e697b9a06f0b6a8d31462e54c8e26ce8010ca123f713d30945617d1d8eb09"],"state_sha256":"91618106a2ecc74c52c9ceac115c39fd0f00aec9374719e500d5217da59b7d69"}