{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:YK5CILY5R3OBKUTLDZRQ6XEY2S","short_pith_number":"pith:YK5CILY5","canonical_record":{"source":{"id":"2605.01188","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-02T01:53:22Z","cross_cats_sorted":[],"title_canon_sha256":"2ba189736d0694a11dca2c844800d19f7f4888957472955e501cd91a3fcce290","abstract_canon_sha256":"e7fe2f567d44a77bb36cdea77cec8567704cde4940c5a000ac85c1215ff375d4"},"schema_version":"1.0"},"canonical_sha256":"c2ba242f1d8edc15526b1e630f5c98d48a21f4d659242a900ec11432447ec65e","source":{"kind":"arxiv","id":"2605.01188","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.01188","created_at":"2026-05-27T02:06:14Z"},{"alias_kind":"arxiv_version","alias_value":"2605.01188v2","created_at":"2026-05-27T02:06:14Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.01188","created_at":"2026-05-27T02:06:14Z"},{"alias_kind":"pith_short_12","alias_value":"YK5CILY5R3OB","created_at":"2026-05-27T02:06:14Z"},{"alias_kind":"pith_short_16","alias_value":"YK5CILY5R3OBKUTL","created_at":"2026-05-27T02:06:14Z"},{"alias_kind":"pith_short_8","alias_value":"YK5CILY5","created_at":"2026-05-27T02:06:14Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:YK5CILY5R3OBKUTLDZRQ6XEY2S","target":"record","payload":{"canonical_record":{"source":{"id":"2605.01188","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-02T01:53:22Z","cross_cats_sorted":[],"title_canon_sha256":"2ba189736d0694a11dca2c844800d19f7f4888957472955e501cd91a3fcce290","abstract_canon_sha256":"e7fe2f567d44a77bb36cdea77cec8567704cde4940c5a000ac85c1215ff375d4"},"schema_version":"1.0"},"canonical_sha256":"c2ba242f1d8edc15526b1e630f5c98d48a21f4d659242a900ec11432447ec65e","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-27T02:06:14.187623Z","signature_b64":"DZG9GuqxuvDdZWch50OBdGaElfeT74BpZJN0sys54UX9rz+PK9ZwrctPx1dvv+fgdp/Vae0rMugSf0UjA3u1AQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"c2ba242f1d8edc15526b1e630f5c98d48a21f4d659242a900ec11432447ec65e","last_reissued_at":"2026-05-27T02:06:14.186783Z","signature_status":"signed_v1","first_computed_at":"2026-05-27T02:06:14.186783Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2605.01188","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-27T02:06:14Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"DbGHJaCYLQ9cUFwCjErrQ7+FJjsu7gOPDb/4iAcvq+0N8QNErp1zhycLdfLhjAM3gLTWmFLwAmuxB2pjWSAbBQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-27T21:24:01.739258Z"},"content_sha256":"2d828196424d66b54755e998940dc3e74b0362094f94c16f1dcb30a79b647f9d","schema_version":"1.0","event_id":"sha256:2d828196424d66b54755e998940dc3e74b0362094f94c16f1dcb30a79b647f9d"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:YK5CILY5R3OBKUTLDZRQ6XEY2S","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Compute Optimal Tokenization","license":"http://creativecommons.org/licenses/by/4.0/","headline":"In compute-optimal regimes, language model parameter counts scale with the byte volume of data rather than the number of tokens.","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Alisa Liu, Artidoro Pagnoni, Gargi Ghosh, Luke Zettlemoyer, Margaret Li, Mike Lewis, Sachin Mehta, Srini Iyer, Tomasz Limisiewicz","submitted_at":"2026-05-02T01:53:22Z","abstract_excerpt":"Scaling laws enable the optimal selection of data amount and language model size, yet the impact of the data unit, the token, on this relationship remains underexplored. In this work, we systematically investigate how the information granularity of tokens, controlled by the compression rate (i.e., average bytes of text per token), affects scaling trends. We train 988 latent tokenized models (BLT) ranging from 50M to 7B parameters that enable setting the desired compression rate. This flexibility allows us to study the role of compression rate well beyond 4.57 bytes per token obtained with a po"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"in compute-optimal configurations, model parameter counts scale proportionally to data size measured in bytes, not in tokens as commonly perceived (Kaplan et al., 2020; Hoffmann et al., 2022)","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the behavior of latent tokenized BLT models generalizes to standard subword tokenizers and that the observed scaling trends extend beyond the tested range up to 7B parameters.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Compute-optimal language models require parameter count to scale with data bytes rather than tokens, with optimal token compression rate decreasing as compute budget grows.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"In compute-optimal regimes, language model parameter counts scale with the byte volume of data rather than the number of tokens.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"ea428ebb176e88d06b95d3e0ffb4a575c2057d3ca5213099ac14a7ae9cd007ed"},"source":{"id":"2605.01188","kind":"arxiv","version":2},"verdict":{"id":"3a2d3b34-1c92-453d-a8bc-f7abbcb3ba6e","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-09T15:23:12.731110Z","strongest_claim":"in compute-optimal configurations, model parameter counts scale proportionally to data size measured in bytes, not in tokens as commonly perceived (Kaplan et al., 2020; Hoffmann et al., 2022)","one_line_summary":"Compute-optimal language models require parameter count to scale with data bytes rather than tokens, with optimal token compression rate decreasing as compute budget grows.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the behavior of latent tokenized BLT models generalizes to standard subword tokenizers and that the observed scaling trends extend beyond the tested range up to 7B parameters.","pith_extraction_headline":"In compute-optimal regimes, language model parameter counts scale with the byte volume of data rather than the number of tokens."},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.01188/integrity.json","findings":[],"available":true,"detectors_run":[{"name":"ai_meta_artifact","ran_at":"2026-05-20T18:37:03.541597Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"doi_compliance","ran_at":"2026-05-19T17:30:51.611300Z","status":"completed","version":"1.0.0","findings_count":0}],"snapshot_sha256":"5a98e956d31332ade85e89e2103c43cb57a93dba5cc32afe2e77559643e52dd8"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"3a2d3b34-1c92-453d-a8bc-f7abbcb3ba6e"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-27T02:06:14Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"/LegNe3Oj5x7zWGJ55nj6f2Jr+APjxmUq860gOk9oxV+qfchzbKb0QUbpNwMowS/Mue/4gRNIXZu1rqolpPlDg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-27T21:24:01.740194Z"},"content_sha256":"b4298431210dc71148e8025f6bda79ed27ee15985047b724de64b09be1fb3cc4","schema_version":"1.0","event_id":"sha256:b4298431210dc71148e8025f6bda79ed27ee15985047b724de64b09be1fb3cc4"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/YK5CILY5R3OBKUTLDZRQ6XEY2S/bundle.json","state_url":"https://pith.science/pith/YK5CILY5R3OBKUTLDZRQ6XEY2S/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/YK5CILY5R3OBKUTLDZRQ6XEY2S/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-27T21:24:01Z","links":{"resolver":"https://pith.science/pith/YK5CILY5R3OBKUTLDZRQ6XEY2S","bundle":"https://pith.science/pith/YK5CILY5R3OBKUTLDZRQ6XEY2S/bundle.json","state":"https://pith.science/pith/YK5CILY5R3OBKUTLDZRQ6XEY2S/state.json","well_known_bundle":"https://pith.science/.well-known/pith/YK5CILY5R3OBKUTLDZRQ6XEY2S/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:YK5CILY5R3OBKUTLDZRQ6XEY2S","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"e7fe2f567d44a77bb36cdea77cec8567704cde4940c5a000ac85c1215ff375d4","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-02T01:53:22Z","title_canon_sha256":"2ba189736d0694a11dca2c844800d19f7f4888957472955e501cd91a3fcce290"},"schema_version":"1.0","source":{"id":"2605.01188","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.01188","created_at":"2026-05-27T02:06:14Z"},{"alias_kind":"arxiv_version","alias_value":"2605.01188v2","created_at":"2026-05-27T02:06:14Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.01188","created_at":"2026-05-27T02:06:14Z"},{"alias_kind":"pith_short_12","alias_value":"YK5CILY5R3OB","created_at":"2026-05-27T02:06:14Z"},{"alias_kind":"pith_short_16","alias_value":"YK5CILY5R3OBKUTL","created_at":"2026-05-27T02:06:14Z"},{"alias_kind":"pith_short_8","alias_value":"YK5CILY5","created_at":"2026-05-27T02:06:14Z"}],"graph_snapshots":[{"event_id":"sha256:b4298431210dc71148e8025f6bda79ed27ee15985047b724de64b09be1fb3cc4","target":"graph","created_at":"2026-05-27T02:06:14Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"in compute-optimal configurations, model parameter counts scale proportionally to data size measured in bytes, not in tokens as commonly perceived (Kaplan et al., 2020; Hoffmann et al., 2022)"},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the behavior of latent tokenized BLT models generalizes to standard subword tokenizers and that the observed scaling trends extend beyond the tested range up to 7B parameters."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Compute-optimal language models require parameter count to scale with data bytes rather than tokens, with optimal token compression rate decreasing as compute budget grows."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"In compute-optimal regimes, language model parameter counts scale with the byte volume of data rather than the number of tokens."}],"snapshot_sha256":"ea428ebb176e88d06b95d3e0ffb4a575c2057d3ca5213099ac14a7ae9cd007ed"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[{"findings_count":0,"name":"ai_meta_artifact","ran_at":"2026-05-20T18:37:03.541597Z","status":"completed","version":"1.0.0"},{"findings_count":0,"name":"doi_compliance","ran_at":"2026-05-19T17:30:51.611300Z","status":"completed","version":"1.0.0"}],"endpoint":"/pith/2605.01188/integrity.json","findings":[],"snapshot_sha256":"5a98e956d31332ade85e89e2103c43cb57a93dba5cc32afe2e77559643e52dd8","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Scaling laws enable the optimal selection of data amount and language model size, yet the impact of the data unit, the token, on this relationship remains underexplored. In this work, we systematically investigate how the information granularity of tokens, controlled by the compression rate (i.e., average bytes of text per token), affects scaling trends. We train 988 latent tokenized models (BLT) ranging from 50M to 7B parameters that enable setting the desired compression rate. This flexibility allows us to study the role of compression rate well beyond 4.57 bytes per token obtained with a po","authors_text":"Alisa Liu, Artidoro Pagnoni, Gargi Ghosh, Luke Zettlemoyer, Margaret Li, Mike Lewis, Sachin Mehta, Srini Iyer, Tomasz Limisiewicz","cross_cats":[],"headline":"In compute-optimal regimes, language model parameter counts scale with the byte volume of data rather than the number of tokens.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-02T01:53:22Z","title":"Compute Optimal Tokenization"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.01188","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-09T15:23:12.731110Z","id":"3a2d3b34-1c92-453d-a8bc-f7abbcb3ba6e","model_set":{"reader":"grok-4.3"},"one_line_summary":"Compute-optimal language models require parameter count to scale with data bytes rather than tokens, with optimal token compression rate decreasing as compute budget grows.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"In compute-optimal regimes, language model parameter counts scale with the byte volume of data rather than the number of tokens.","strongest_claim":"in compute-optimal configurations, model parameter counts scale proportionally to data size measured in bytes, not in tokens as commonly perceived (Kaplan et al., 2020; Hoffmann et al., 2022)","weakest_assumption":"That the behavior of latent tokenized BLT models generalizes to standard subword tokenizers and that the observed scaling trends extend beyond the tested range up to 7B parameters."}},"verdict_id":"3a2d3b34-1c92-453d-a8bc-f7abbcb3ba6e"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:2d828196424d66b54755e998940dc3e74b0362094f94c16f1dcb30a79b647f9d","target":"record","created_at":"2026-05-27T02:06:14Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"e7fe2f567d44a77bb36cdea77cec8567704cde4940c5a000ac85c1215ff375d4","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-02T01:53:22Z","title_canon_sha256":"2ba189736d0694a11dca2c844800d19f7f4888957472955e501cd91a3fcce290"},"schema_version":"1.0","source":{"id":"2605.01188","kind":"arxiv","version":2}},"canonical_sha256":"c2ba242f1d8edc15526b1e630f5c98d48a21f4d659242a900ec11432447ec65e","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"c2ba242f1d8edc15526b1e630f5c98d48a21f4d659242a900ec11432447ec65e","first_computed_at":"2026-05-27T02:06:14.186783Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-27T02:06:14.186783Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"DZG9GuqxuvDdZWch50OBdGaElfeT74BpZJN0sys54UX9rz+PK9ZwrctPx1dvv+fgdp/Vae0rMugSf0UjA3u1AQ==","signature_status":"signed_v1","signed_at":"2026-05-27T02:06:14.187623Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.01188","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:2d828196424d66b54755e998940dc3e74b0362094f94c16f1dcb30a79b647f9d","sha256:b4298431210dc71148e8025f6bda79ed27ee15985047b724de64b09be1fb3cc4"],"state_sha256":"401cade3d9d50ef829f7df9683e2d20bc0de6a2efc9444c0c9038c6a963d2ae4"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"rKMycJB6vG9rTRwzg7OY2bJyhPfwYU/7STbACT8AyGaRoPnLSw4G3qdpqXV3xiHFs01T0KYK1pDv5pcOmEEkCA==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-27T21:24:01.744747Z","bundle_sha256":"b541d2ce85e2e7d0a2d62f588d5fecb96d0acb84a121902fdc2d717dad971e43"}}