{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:VTOLHSYY4MMZESS764EAYMOK3I","short_pith_number":"pith:VTOLHSYY","canonical_record":{"source":{"id":"2605.22705","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-21T16:46:23Z","cross_cats_sorted":[],"title_canon_sha256":"3ebe19cb16adbc2fe1249d17bbd0a09eb424d104b79351e4ace4af894c3d3e7e","abstract_canon_sha256":"4546cb428a7f68a7694e98cad586fdf3987598ba0154b07ce4008f45ac0071b8"},"schema_version":"1.0"},"canonical_sha256":"acdcb3cb18e319924a5ff7080c31cada11b7c75c83911c9015f5375d649408a4","source":{"kind":"arxiv","id":"2605.22705","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.22705","created_at":"2026-05-22T02:04:51Z"},{"alias_kind":"arxiv_version","alias_value":"2605.22705v1","created_at":"2026-05-22T02:04:51Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.22705","created_at":"2026-05-22T02:04:51Z"},{"alias_kind":"pith_short_12","alias_value":"VTOLHSYY4MMZ","created_at":"2026-05-22T02:04:51Z"},{"alias_kind":"pith_short_16","alias_value":"VTOLHSYY4MMZESS7","created_at":"2026-05-22T02:04:51Z"},{"alias_kind":"pith_short_8","alias_value":"VTOLHSYY","created_at":"2026-05-22T02:04:51Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:VTOLHSYY4MMZESS764EAYMOK3I","target":"record","payload":{"canonical_record":{"source":{"id":"2605.22705","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-21T16:46:23Z","cross_cats_sorted":[],"title_canon_sha256":"3ebe19cb16adbc2fe1249d17bbd0a09eb424d104b79351e4ace4af894c3d3e7e","abstract_canon_sha256":"4546cb428a7f68a7694e98cad586fdf3987598ba0154b07ce4008f45ac0071b8"},"schema_version":"1.0"},"canonical_sha256":"acdcb3cb18e319924a5ff7080c31cada11b7c75c83911c9015f5375d649408a4","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-22T02:04:51.310573Z","signature_b64":"sphk4A8AToiWOm4uiunKf7S/LGMWf5fZenb0JRWQHkOTwaQj7RX0/WJqZZC15ZpbWoNSXrzHiCf4Ehn1/xAUBQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"acdcb3cb18e319924a5ff7080c31cada11b7c75c83911c9015f5375d649408a4","last_reissued_at":"2026-05-22T02:04:51.309979Z","signature_status":"signed_v1","first_computed_at":"2026-05-22T02:04:51.309979Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2605.22705","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-22T02:04:51Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"BcgniHWBh4/T3ka9P3pIykCFy4vfpcCA1YalebXFf0BXSAobTKYteqaFz2FBMb041zzsdQHsCME1ATYW0TljBA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-27T17:54:32.873640Z"},"content_sha256":"2816308d9ab5bb9a6d145096a3aaca1fe59c4cd7e4ee1bf6653ec47316e91b55","schema_version":"1.0","event_id":"sha256:2816308d9ab5bb9a6d145096a3aaca1fe59c4cd7e4ee1bf6653ec47316e91b55"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:VTOLHSYY4MMZESS764EAYMOK3I","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Tokenization with Split Trees","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Adam Wiemerslage, Chris Tanner, Craig W. Schmidt, Michael Krumdick, Seth Ebner, Varshini Reddy, Yuval Pinter","submitted_at":"2026-05-21T16:46:23Z","abstract_excerpt":"We introduce Tokenization with Split Trees (ToaST), a subword tokenization method that directly optimizes compression under a new recursive inference procedure. ToaST greedily splits each pretoken into a full binary tree using precomputed byte n-gram counts, independent of any vocabulary. Given a vocabulary, inference recursively descends each split tree and emits the first in-vocabulary node reached on each path. Vocabulary selection is formulated as an Integer Program (IP) that minimizes the total token count over all split trees under this inference procedure. The Linear Programming (LP) re"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.22705","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.22705/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-22T02:04:51Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"HZ/OxWAfTXxPLHKqvl35IMdCnuz6O+ibD3UQkpNJxB2ijEKx+vQ7H5QUHyxT5DWysorTzRk50u2jNitw4yAqBA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-27T17:54:32.874346Z"},"content_sha256":"d5a6547ebed82d836393b7eaa90103fe744d568761b48f2e06cf0c0b6f76ff82","schema_version":"1.0","event_id":"sha256:d5a6547ebed82d836393b7eaa90103fe744d568761b48f2e06cf0c0b6f76ff82"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/VTOLHSYY4MMZESS764EAYMOK3I/bundle.json","state_url":"https://pith.science/pith/VTOLHSYY4MMZESS764EAYMOK3I/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/VTOLHSYY4MMZESS764EAYMOK3I/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-27T17:54:32Z","links":{"resolver":"https://pith.science/pith/VTOLHSYY4MMZESS764EAYMOK3I","bundle":"https://pith.science/pith/VTOLHSYY4MMZESS764EAYMOK3I/bundle.json","state":"https://pith.science/pith/VTOLHSYY4MMZESS764EAYMOK3I/state.json","well_known_bundle":"https://pith.science/.well-known/pith/VTOLHSYY4MMZESS764EAYMOK3I/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:VTOLHSYY4MMZESS764EAYMOK3I","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"4546cb428a7f68a7694e98cad586fdf3987598ba0154b07ce4008f45ac0071b8","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-21T16:46:23Z","title_canon_sha256":"3ebe19cb16adbc2fe1249d17bbd0a09eb424d104b79351e4ace4af894c3d3e7e"},"schema_version":"1.0","source":{"id":"2605.22705","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.22705","created_at":"2026-05-22T02:04:51Z"},{"alias_kind":"arxiv_version","alias_value":"2605.22705v1","created_at":"2026-05-22T02:04:51Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.22705","created_at":"2026-05-22T02:04:51Z"},{"alias_kind":"pith_short_12","alias_value":"VTOLHSYY4MMZ","created_at":"2026-05-22T02:04:51Z"},{"alias_kind":"pith_short_16","alias_value":"VTOLHSYY4MMZESS7","created_at":"2026-05-22T02:04:51Z"},{"alias_kind":"pith_short_8","alias_value":"VTOLHSYY","created_at":"2026-05-22T02:04:51Z"}],"graph_snapshots":[{"event_id":"sha256:d5a6547ebed82d836393b7eaa90103fe744d568761b48f2e06cf0c0b6f76ff82","target":"graph","created_at":"2026-05-22T02:04:51Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2605.22705/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"We introduce Tokenization with Split Trees (ToaST), a subword tokenization method that directly optimizes compression under a new recursive inference procedure. ToaST greedily splits each pretoken into a full binary tree using precomputed byte n-gram counts, independent of any vocabulary. Given a vocabulary, inference recursively descends each split tree and emits the first in-vocabulary node reached on each path. Vocabulary selection is formulated as an Integer Program (IP) that minimizes the total token count over all split trees under this inference procedure. The Linear Programming (LP) re","authors_text":"Adam Wiemerslage, Chris Tanner, Craig W. Schmidt, Michael Krumdick, Seth Ebner, Varshini Reddy, Yuval Pinter","cross_cats":[],"headline":"","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-21T16:46:23Z","title":"Tokenization with Split Trees"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.22705","kind":"arxiv","version":1},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:2816308d9ab5bb9a6d145096a3aaca1fe59c4cd7e4ee1bf6653ec47316e91b55","target":"record","created_at":"2026-05-22T02:04:51Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"4546cb428a7f68a7694e98cad586fdf3987598ba0154b07ce4008f45ac0071b8","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-21T16:46:23Z","title_canon_sha256":"3ebe19cb16adbc2fe1249d17bbd0a09eb424d104b79351e4ace4af894c3d3e7e"},"schema_version":"1.0","source":{"id":"2605.22705","kind":"arxiv","version":1}},"canonical_sha256":"acdcb3cb18e319924a5ff7080c31cada11b7c75c83911c9015f5375d649408a4","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"acdcb3cb18e319924a5ff7080c31cada11b7c75c83911c9015f5375d649408a4","first_computed_at":"2026-05-22T02:04:51.309979Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-22T02:04:51.309979Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"sphk4A8AToiWOm4uiunKf7S/LGMWf5fZenb0JRWQHkOTwaQj7RX0/WJqZZC15ZpbWoNSXrzHiCf4Ehn1/xAUBQ==","signature_status":"signed_v1","signed_at":"2026-05-22T02:04:51.310573Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.22705","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:2816308d9ab5bb9a6d145096a3aaca1fe59c4cd7e4ee1bf6653ec47316e91b55","sha256:d5a6547ebed82d836393b7eaa90103fe744d568761b48f2e06cf0c0b6f76ff82"],"state_sha256":"948799e5d3b7362e8a947b31bf1600858704048d0c2b15194de328570a756731"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"X58uXHPjRzHoOt+GpQWx9ZaPEuc9RRiCLoS1qHA26tN/UjiiOPfiu90coZoSmM82poLoiGISuned+VXC1PBiAQ==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-27T17:54:32.878278Z","bundle_sha256":"86a26cbc63763658cad9aa7a75ba99aec3a8213796076b0155dead3c5517e46c"}}