{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:R7QK6ECMLIEG56GVPQGZMAFZZN","short_pith_number":"pith:R7QK6ECM","canonical_record":{"source":{"id":"2601.03019","kind":"arxiv","version":4},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"q-bio.GN","submitted_at":"2026-01-06T13:46:42Z","cross_cats_sorted":["cs.CL"],"title_canon_sha256":"948e9ad2ef66c65da7113e87b9d5fc23433e87a6bc4aebf58051f5d171830120","abstract_canon_sha256":"f7892e7e0deb502203d9649ddf5220b33e2b48036618cf4f3e37f41d75fe900d"},"schema_version":"1.0"},"canonical_sha256":"8fe0af104c5a086ef8d57c0d9600b9cb6048a555edb6fa0570caaa8c1432748d","source":{"kind":"arxiv","id":"2601.03019","version":4},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2601.03019","created_at":"2026-05-21T01:05:14Z"},{"alias_kind":"arxiv_version","alias_value":"2601.03019v4","created_at":"2026-05-21T01:05:14Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2601.03019","created_at":"2026-05-21T01:05:14Z"},{"alias_kind":"pith_short_12","alias_value":"R7QK6ECMLIEG","created_at":"2026-05-21T01:05:14Z"},{"alias_kind":"pith_short_16","alias_value":"R7QK6ECMLIEG56GV","created_at":"2026-05-21T01:05:14Z"},{"alias_kind":"pith_short_8","alias_value":"R7QK6ECM","created_at":"2026-05-21T01:05:14Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:R7QK6ECMLIEG56GVPQGZMAFZZN","target":"record","payload":{"canonical_record":{"source":{"id":"2601.03019","kind":"arxiv","version":4},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"q-bio.GN","submitted_at":"2026-01-06T13:46:42Z","cross_cats_sorted":["cs.CL"],"title_canon_sha256":"948e9ad2ef66c65da7113e87b9d5fc23433e87a6bc4aebf58051f5d171830120","abstract_canon_sha256":"f7892e7e0deb502203d9649ddf5220b33e2b48036618cf4f3e37f41d75fe900d"},"schema_version":"1.0"},"canonical_sha256":"8fe0af104c5a086ef8d57c0d9600b9cb6048a555edb6fa0570caaa8c1432748d","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-21T01:05:14.335464Z","signature_b64":"pNjxJ1IV6ujSTF9p69kyicZ5By2/BWaASrSihwiIShA4qIYdBRsWHuk28lE1CMqVAhgBJLmntK/kZlhAsQWqCQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"8fe0af104c5a086ef8d57c0d9600b9cb6048a555edb6fa0570caaa8c1432748d","last_reissued_at":"2026-05-21T01:05:14.334510Z","signature_status":"signed_v1","first_computed_at":"2026-05-21T01:05:14.334510Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2601.03019","source_version":4,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-21T01:05:14Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"hdPGJSR/xsqjI51UJPePpgjKrng1InUhAL39S6QJtd7BusFxwBGu+3OfYnCtn2MgFM3cnNVRqkqDGUKcclUjBA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-28T07:53:13.071186Z"},"content_sha256":"198a0d05fcdf811e164517e5aa015f45d98c2e2d9b09606837a5a0bcbbd29108","schema_version":"1.0","event_id":"sha256:198a0d05fcdf811e164517e5aa015f45d98c2e2d9b09606837a5a0bcbbd29108"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:R7QK6ECMLIEG56GVPQGZMAFZZN","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"DNACHUNKER: Learnable Tokenization for DNA Language Models","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.CL"],"primary_cat":"q-bio.GN","authors_text":"Hyomin Kim, Insu Han, Jihwan Shin, Jonghoon Lee, Sungsoo Ahn, Taewon Kim, Won-Chul Lee, Youngmok Jung","submitted_at":"2026-01-06T13:46:42Z","abstract_excerpt":"DNA language models are increasingly used to represent genomic sequence, yet their effectiveness depends critically on how raw nucleotides are converted into model inputs. Unlike natural language, DNA offers no canonical boundaries, making fixed tokenizations a brittle design choice under shifts, indels, and local repeats. We introduce DNAChunker, a masked DNA language model that incorporates a learnable adaptive segmentation module to produce context-dependent, variable-length units. Building on a dynamic segmentation procedure, DNAChunker learns to allocate finer granularity to functionally "},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2601.03019","kind":"arxiv","version":4},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2601.03019/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-21T01:05:14Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"NMqTD3S91YHEBbVbRt9cddqY3nIdwvK75onw8R8MqdebdMAgksDtf9Ot+HHIYVwKMZxpIyFU9iTD/lB4pBQyCw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-28T07:53:13.071872Z"},"content_sha256":"cfef38b80c272baf8ca637d64eb988f981d2fb9785bebc5baeffd1c674c848f4","schema_version":"1.0","event_id":"sha256:cfef38b80c272baf8ca637d64eb988f981d2fb9785bebc5baeffd1c674c848f4"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/R7QK6ECMLIEG56GVPQGZMAFZZN/bundle.json","state_url":"https://pith.science/pith/R7QK6ECMLIEG56GVPQGZMAFZZN/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/R7QK6ECMLIEG56GVPQGZMAFZZN/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-28T07:53:13Z","links":{"resolver":"https://pith.science/pith/R7QK6ECMLIEG56GVPQGZMAFZZN","bundle":"https://pith.science/pith/R7QK6ECMLIEG56GVPQGZMAFZZN/bundle.json","state":"https://pith.science/pith/R7QK6ECMLIEG56GVPQGZMAFZZN/state.json","well_known_bundle":"https://pith.science/.well-known/pith/R7QK6ECMLIEG56GVPQGZMAFZZN/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:R7QK6ECMLIEG56GVPQGZMAFZZN","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"f7892e7e0deb502203d9649ddf5220b33e2b48036618cf4f3e37f41d75fe900d","cross_cats_sorted":["cs.CL"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"q-bio.GN","submitted_at":"2026-01-06T13:46:42Z","title_canon_sha256":"948e9ad2ef66c65da7113e87b9d5fc23433e87a6bc4aebf58051f5d171830120"},"schema_version":"1.0","source":{"id":"2601.03019","kind":"arxiv","version":4}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2601.03019","created_at":"2026-05-21T01:05:14Z"},{"alias_kind":"arxiv_version","alias_value":"2601.03019v4","created_at":"2026-05-21T01:05:14Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2601.03019","created_at":"2026-05-21T01:05:14Z"},{"alias_kind":"pith_short_12","alias_value":"R7QK6ECMLIEG","created_at":"2026-05-21T01:05:14Z"},{"alias_kind":"pith_short_16","alias_value":"R7QK6ECMLIEG56GV","created_at":"2026-05-21T01:05:14Z"},{"alias_kind":"pith_short_8","alias_value":"R7QK6ECM","created_at":"2026-05-21T01:05:14Z"}],"graph_snapshots":[{"event_id":"sha256:cfef38b80c272baf8ca637d64eb988f981d2fb9785bebc5baeffd1c674c848f4","target":"graph","created_at":"2026-05-21T01:05:14Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2601.03019/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"DNA language models are increasingly used to represent genomic sequence, yet their effectiveness depends critically on how raw nucleotides are converted into model inputs. Unlike natural language, DNA offers no canonical boundaries, making fixed tokenizations a brittle design choice under shifts, indels, and local repeats. We introduce DNAChunker, a masked DNA language model that incorporates a learnable adaptive segmentation module to produce context-dependent, variable-length units. Building on a dynamic segmentation procedure, DNAChunker learns to allocate finer granularity to functionally ","authors_text":"Hyomin Kim, Insu Han, Jihwan Shin, Jonghoon Lee, Sungsoo Ahn, Taewon Kim, Won-Chul Lee, Youngmok Jung","cross_cats":["cs.CL"],"headline":"","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"q-bio.GN","submitted_at":"2026-01-06T13:46:42Z","title":"DNACHUNKER: Learnable Tokenization for DNA Language Models"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2601.03019","kind":"arxiv","version":4},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:198a0d05fcdf811e164517e5aa015f45d98c2e2d9b09606837a5a0bcbbd29108","target":"record","created_at":"2026-05-21T01:05:14Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"f7892e7e0deb502203d9649ddf5220b33e2b48036618cf4f3e37f41d75fe900d","cross_cats_sorted":["cs.CL"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"q-bio.GN","submitted_at":"2026-01-06T13:46:42Z","title_canon_sha256":"948e9ad2ef66c65da7113e87b9d5fc23433e87a6bc4aebf58051f5d171830120"},"schema_version":"1.0","source":{"id":"2601.03019","kind":"arxiv","version":4}},"canonical_sha256":"8fe0af104c5a086ef8d57c0d9600b9cb6048a555edb6fa0570caaa8c1432748d","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"8fe0af104c5a086ef8d57c0d9600b9cb6048a555edb6fa0570caaa8c1432748d","first_computed_at":"2026-05-21T01:05:14.334510Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-21T01:05:14.334510Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"pNjxJ1IV6ujSTF9p69kyicZ5By2/BWaASrSihwiIShA4qIYdBRsWHuk28lE1CMqVAhgBJLmntK/kZlhAsQWqCQ==","signature_status":"signed_v1","signed_at":"2026-05-21T01:05:14.335464Z","signed_message":"canonical_sha256_bytes"},"source_id":"2601.03019","source_kind":"arxiv","source_version":4}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:198a0d05fcdf811e164517e5aa015f45d98c2e2d9b09606837a5a0bcbbd29108","sha256:cfef38b80c272baf8ca637d64eb988f981d2fb9785bebc5baeffd1c674c848f4"],"state_sha256":"22d7a4a474e436b7ae635885a65922271a02a94443802328c88c2ce2170e44ea"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"3BAr7V4atrIVr9KaqWv9r6YPdbqjmztsiH+b/JZzDsT5hTcFIuwzrfxodekzPwqYQmcbD3N51cQgTV1Dkt4hAg==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-28T07:53:13.075250Z","bundle_sha256":"f1fab1aea212f4623cff4a43c1edb8bf2620de54f1e86d74649101e56a9f22dc"}}