{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:36DVMUZE3NW34S5T2DFB56FQE6","short_pith_number":"pith:36DVMUZE","canonical_record":{"source":{"id":"2605.13429","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2026-05-13T12:23:24Z","cross_cats_sorted":[],"title_canon_sha256":"c606c48b7c7bd35750f71889f07f8311eec096732906fb2ef2bb049199bbd750","abstract_canon_sha256":"7a6d459d6f44f508c222989d94abf30fa72714945586184c792f92a59b0e697f"},"schema_version":"1.0"},"canonical_sha256":"df87565324db6dbe4bb3d0ca1ef8b027970dc678a49d364ea8e7727e14dcfbc3","source":{"kind":"arxiv","id":"2605.13429","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.13429","created_at":"2026-05-18T02:44:47Z"},{"alias_kind":"arxiv_version","alias_value":"2605.13429v1","created_at":"2026-05-18T02:44:47Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.13429","created_at":"2026-05-18T02:44:47Z"},{"alias_kind":"pith_short_12","alias_value":"36DVMUZE3NW3","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"36DVMUZE3NW34S5T","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"36DVMUZE","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:36DVMUZE3NW34S5T2DFB56FQE6","target":"record","payload":{"canonical_record":{"source":{"id":"2605.13429","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2026-05-13T12:23:24Z","cross_cats_sorted":[],"title_canon_sha256":"c606c48b7c7bd35750f71889f07f8311eec096732906fb2ef2bb049199bbd750","abstract_canon_sha256":"7a6d459d6f44f508c222989d94abf30fa72714945586184c792f92a59b0e697f"},"schema_version":"1.0"},"canonical_sha256":"df87565324db6dbe4bb3d0ca1ef8b027970dc678a49d364ea8e7727e14dcfbc3","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T02:44:47.216275Z","signature_b64":"0XvVWhe+xFVTr2gIeM8w5LK1IA1KRkpjN+pczPtQx40vDV89GMxlLlNkmwe0a1BDHPlAwQaHbgyRlsGJ8QhwAA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"df87565324db6dbe4bb3d0ca1ef8b027970dc678a49d364ea8e7727e14dcfbc3","last_reissued_at":"2026-05-18T02:44:47.215788Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T02:44:47.215788Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2605.13429","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T02:44:47Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"1x7NTRg6olIXP3HAKw6291BlDYzzzc2N3kL3ANc5zNeZGFHXZwcafw/1MZ7InqYqLEMggMFUFET4OvOdHDhfDQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-10T22:33:35.848245Z"},"content_sha256":"9725ed95745d49f21686f4f0b5f334366e155dc761fb592ac0f84a985dceff1c","schema_version":"1.0","event_id":"sha256:9725ed95745d49f21686f4f0b5f334366e155dc761fb592ac0f84a985dceff1c"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:36DVMUZE3NW34S5T2DFB56FQE6","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"TokAlign++: Advancing Vocabulary Adaptation via Better Token Alignment","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"By learning bilingual token alignments from monolingual representations, TokAlign++ rearranges parameters to adapt LLM vocabularies while preserving performance and boosting compression.","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Chengqing Zong, Chong Li, Jiajun Zhang, Wen Yang, Yingzhuo Deng","submitted_at":"2026-05-13T12:23:24Z","abstract_excerpt":"Tokenization is a foundational step in the text process of Large Language Models (LLMs). Texts must be first tokenized into token IDs, which are then input to LLMs. Inefficient tokenization results in long token-ID sequences and will slow down the training and inference of LLMs. The fine-grained knowledge transfer between LLMs, like token-level distillation, is also impeded by the mismatch in vocabulary. To bridge this gap, we introduce a method named TokAlign++ to improve vocabulary adaptation performance by learning better token alignment lexicon. The source and target vocabularies are taken"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Experimental results on 15 languages show that our method boosts the multilingual text compression rates and preserves most of the multilingual ability of vanilla models. It costs as few as 1k steps to restore the performance of the vanilla model. After unifying vocabularies between vanilla models, token-level distillation remarkably improves the base model with only 235M tokens.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The assumption that a bilingual token alignment lexicon learned solely from monolingual token representations will provide accurate enough mappings to allow parameter rearrangement and progressive fine-tuning to succeed with only minor performance loss.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"TokAlign++ learns token alignments between LLM vocabularies from monolingual representations to enable faster adaptation, better text compression, and effective token-level distillation across 15 languages with minimal steps.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"By learning bilingual token alignments from monolingual representations, TokAlign++ rearranges parameters to adapt LLM vocabularies while preserving performance and boosting compression.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"38f4e49cef3530ecf64a892fed5705719c81bac28dbf287983a4ad8c476b79e5"},"source":{"id":"2605.13429","kind":"arxiv","version":1},"verdict":{"id":"3930e7ee-a82a-48ac-bf51-c1d02b180c93","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-14T19:30:54.808410Z","strongest_claim":"Experimental results on 15 languages show that our method boosts the multilingual text compression rates and preserves most of the multilingual ability of vanilla models. It costs as few as 1k steps to restore the performance of the vanilla model. After unifying vocabularies between vanilla models, token-level distillation remarkably improves the base model with only 235M tokens.","one_line_summary":"TokAlign++ learns token alignments between LLM vocabularies from monolingual representations to enable faster adaptation, better text compression, and effective token-level distillation across 15 languages with minimal steps.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The assumption that a bilingual token alignment lexicon learned solely from monolingual token representations will provide accurate enough mappings to allow parameter rearrangement and progressive fine-tuning to succeed with only minor performance loss.","pith_extraction_headline":"By learning bilingual token alignments from monolingual representations, TokAlign++ rearranges parameters to adapt LLM vocabularies while preserving performance and boosting compression."},"references":{"count":136,"sample":[{"doi":"","year":null,"title":"This is an example of sample bibitem article title , journal =","work_id":"df456ca0-0399-45b1-825d-cb87d223b3b2","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"This is an example of sample bibitem article title , booktitle =","work_id":"076627da-b2d6-4cd5-b8d0-d7778490fef4","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Scaling Learning Algorithms Towards","work_id":"bb2761cc-98d0-411b-92f6-803773d64460","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"and Osindero, Simon and Teh, Yee Whye , journal =","work_id":"0a5921e3-ac4e-46f1-85ae-866119a87be0","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2016,"title":"Deep learning , author=. 2016 , publisher=","work_id":"cf0899e0-53ee-4591-aae4-f38fa5ac12ad","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":136,"snapshot_sha256":"f50a10555acce5f461a936a894299ad41d49ea193efe09fd7c573171598d6a2e","internal_anchors":6},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"3930e7ee-a82a-48ac-bf51-c1d02b180c93"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T02:44:47Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"Xi9bLYhTfjHzPp1gjK8udioAW9NcEay2nWFrbMs+rwLynptOY/50YLszEo4JKzAngQT1IiWOTpAxh8OgUaqBCQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-10T22:33:35.854160Z"},"content_sha256":"310cf76918477cc1e840ccb091f9aca1276fa52a547e244d2fb88b40b6e395ec","schema_version":"1.0","event_id":"sha256:310cf76918477cc1e840ccb091f9aca1276fa52a547e244d2fb88b40b6e395ec"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/36DVMUZE3NW34S5T2DFB56FQE6/bundle.json","state_url":"https://pith.science/pith/36DVMUZE3NW34S5T2DFB56FQE6/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/36DVMUZE3NW34S5T2DFB56FQE6/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-10T22:33:35Z","links":{"resolver":"https://pith.science/pith/36DVMUZE3NW34S5T2DFB56FQE6","bundle":"https://pith.science/pith/36DVMUZE3NW34S5T2DFB56FQE6/bundle.json","state":"https://pith.science/pith/36DVMUZE3NW34S5T2DFB56FQE6/state.json","well_known_bundle":"https://pith.science/.well-known/pith/36DVMUZE3NW34S5T2DFB56FQE6/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:36DVMUZE3NW34S5T2DFB56FQE6","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"7a6d459d6f44f508c222989d94abf30fa72714945586184c792f92a59b0e697f","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2026-05-13T12:23:24Z","title_canon_sha256":"c606c48b7c7bd35750f71889f07f8311eec096732906fb2ef2bb049199bbd750"},"schema_version":"1.0","source":{"id":"2605.13429","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.13429","created_at":"2026-05-18T02:44:47Z"},{"alias_kind":"arxiv_version","alias_value":"2605.13429v1","created_at":"2026-05-18T02:44:47Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.13429","created_at":"2026-05-18T02:44:47Z"},{"alias_kind":"pith_short_12","alias_value":"36DVMUZE3NW3","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"36DVMUZE3NW34S5T","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"36DVMUZE","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:310cf76918477cc1e840ccb091f9aca1276fa52a547e244d2fb88b40b6e395ec","target":"graph","created_at":"2026-05-18T02:44:47Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Experimental results on 15 languages show that our method boosts the multilingual text compression rates and preserves most of the multilingual ability of vanilla models. It costs as few as 1k steps to restore the performance of the vanilla model. After unifying vocabularies between vanilla models, token-level distillation remarkably improves the base model with only 235M tokens."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"The assumption that a bilingual token alignment lexicon learned solely from monolingual token representations will provide accurate enough mappings to allow parameter rearrangement and progressive fine-tuning to succeed with only minor performance loss."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"TokAlign++ learns token alignments between LLM vocabularies from monolingual representations to enable faster adaptation, better text compression, and effective token-level distillation across 15 languages with minimal steps."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"By learning bilingual token alignments from monolingual representations, TokAlign++ rearranges parameters to adapt LLM vocabularies while preserving performance and boosting compression."}],"snapshot_sha256":"38f4e49cef3530ecf64a892fed5705719c81bac28dbf287983a4ad8c476b79e5"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"Tokenization is a foundational step in the text process of Large Language Models (LLMs). Texts must be first tokenized into token IDs, which are then input to LLMs. Inefficient tokenization results in long token-ID sequences and will slow down the training and inference of LLMs. The fine-grained knowledge transfer between LLMs, like token-level distillation, is also impeded by the mismatch in vocabulary. To bridge this gap, we introduce a method named TokAlign++ to improve vocabulary adaptation performance by learning better token alignment lexicon. The source and target vocabularies are taken","authors_text":"Chengqing Zong, Chong Li, Jiajun Zhang, Wen Yang, Yingzhuo Deng","cross_cats":[],"headline":"By learning bilingual token alignments from monolingual representations, TokAlign++ rearranges parameters to adapt LLM vocabularies while preserving performance and boosting compression.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2026-05-13T12:23:24Z","title":"TokAlign++: Advancing Vocabulary Adaptation via Better Token Alignment"},"references":{"count":136,"internal_anchors":6,"resolved_work":136,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"This is an example of sample bibitem article title , journal =","work_id":"df456ca0-0399-45b1-825d-cb87d223b3b2","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"This is an example of sample bibitem article title , booktitle =","work_id":"076627da-b2d6-4cd5-b8d0-d7778490fef4","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Scaling Learning Algorithms Towards","work_id":"bb2761cc-98d0-411b-92f6-803773d64460","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"and Osindero, Simon and Teh, Yee Whye , journal =","work_id":"0a5921e3-ac4e-46f1-85ae-866119a87be0","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Deep learning , author=. 2016 , publisher=","work_id":"cf0899e0-53ee-4591-aae4-f38fa5ac12ad","year":2016}],"snapshot_sha256":"f50a10555acce5f461a936a894299ad41d49ea193efe09fd7c573171598d6a2e"},"source":{"id":"2605.13429","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-14T19:30:54.808410Z","id":"3930e7ee-a82a-48ac-bf51-c1d02b180c93","model_set":{"reader":"grok-4.3"},"one_line_summary":"TokAlign++ learns token alignments between LLM vocabularies from monolingual representations to enable faster adaptation, better text compression, and effective token-level distillation across 15 languages with minimal steps.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"By learning bilingual token alignments from monolingual representations, TokAlign++ rearranges parameters to adapt LLM vocabularies while preserving performance and boosting compression.","strongest_claim":"Experimental results on 15 languages show that our method boosts the multilingual text compression rates and preserves most of the multilingual ability of vanilla models. It costs as few as 1k steps to restore the performance of the vanilla model. After unifying vocabularies between vanilla models, token-level distillation remarkably improves the base model with only 235M tokens.","weakest_assumption":"The assumption that a bilingual token alignment lexicon learned solely from monolingual token representations will provide accurate enough mappings to allow parameter rearrangement and progressive fine-tuning to succeed with only minor performance loss."}},"verdict_id":"3930e7ee-a82a-48ac-bf51-c1d02b180c93"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:9725ed95745d49f21686f4f0b5f334366e155dc761fb592ac0f84a985dceff1c","target":"record","created_at":"2026-05-18T02:44:47Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"7a6d459d6f44f508c222989d94abf30fa72714945586184c792f92a59b0e697f","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2026-05-13T12:23:24Z","title_canon_sha256":"c606c48b7c7bd35750f71889f07f8311eec096732906fb2ef2bb049199bbd750"},"schema_version":"1.0","source":{"id":"2605.13429","kind":"arxiv","version":1}},"canonical_sha256":"df87565324db6dbe4bb3d0ca1ef8b027970dc678a49d364ea8e7727e14dcfbc3","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"df87565324db6dbe4bb3d0ca1ef8b027970dc678a49d364ea8e7727e14dcfbc3","first_computed_at":"2026-05-18T02:44:47.215788Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-18T02:44:47.215788Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"0XvVWhe+xFVTr2gIeM8w5LK1IA1KRkpjN+pczPtQx40vDV89GMxlLlNkmwe0a1BDHPlAwQaHbgyRlsGJ8QhwAA==","signature_status":"signed_v1","signed_at":"2026-05-18T02:44:47.216275Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.13429","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:9725ed95745d49f21686f4f0b5f334366e155dc761fb592ac0f84a985dceff1c","sha256:310cf76918477cc1e840ccb091f9aca1276fa52a547e244d2fb88b40b6e395ec"],"state_sha256":"fa5b813bc30bed70c52f71ac8f335f9215d170e767aac9f930e178741d45c188"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"I4eYAFRIbCU/RzJo4GmyDdPXGh56BK58lAqt83TjYIVCvt0wdtKWtBoK2+DAIrPooit8N0dm7jK+jhgAuqmfBw==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-10T22:33:35.859116Z","bundle_sha256":"d2283bace3586006886070a765ad655b694c107f336a266ef186abfbbb41ea20"}}