{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:R7QK6ECMLIEG56GVPQGZMAFZZN","short_pith_number":"pith:R7QK6ECM","schema_version":"1.0","canonical_sha256":"8fe0af104c5a086ef8d57c0d9600b9cb6048a555edb6fa0570caaa8c1432748d","source":{"kind":"arxiv","id":"2601.03019","version":4},"attestation_state":"computed","paper":{"title":"DNACHUNKER: Learnable Tokenization for DNA Language Models","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.CL"],"primary_cat":"q-bio.GN","authors_text":"Hyomin Kim, Insu Han, Jihwan Shin, Jonghoon Lee, Sungsoo Ahn, Taewon Kim, Won-Chul Lee, Youngmok Jung","submitted_at":"2026-01-06T13:46:42Z","abstract_excerpt":"DNA language models are increasingly used to represent genomic sequence, yet their effectiveness depends critically on how raw nucleotides are converted into model inputs. Unlike natural language, DNA offers no canonical boundaries, making fixed tokenizations a brittle design choice under shifts, indels, and local repeats. We introduce DNAChunker, a masked DNA language model that incorporates a learnable adaptive segmentation module to produce context-dependent, variable-length units. Building on a dynamic segmentation procedure, DNAChunker learns to allocate finer granularity to functionally "},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2601.03019","kind":"arxiv","version":4},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"q-bio.GN","submitted_at":"2026-01-06T13:46:42Z","cross_cats_sorted":["cs.CL"],"title_canon_sha256":"948e9ad2ef66c65da7113e87b9d5fc23433e87a6bc4aebf58051f5d171830120","abstract_canon_sha256":"f7892e7e0deb502203d9649ddf5220b33e2b48036618cf4f3e37f41d75fe900d"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-21T01:05:14.335464Z","signature_b64":"pNjxJ1IV6ujSTF9p69kyicZ5By2/BWaASrSihwiIShA4qIYdBRsWHuk28lE1CMqVAhgBJLmntK/kZlhAsQWqCQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"8fe0af104c5a086ef8d57c0d9600b9cb6048a555edb6fa0570caaa8c1432748d","last_reissued_at":"2026-05-21T01:05:14.334510Z","signature_status":"signed_v1","first_computed_at":"2026-05-21T01:05:14.334510Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"DNACHUNKER: Learnable Tokenization for DNA Language Models","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.CL"],"primary_cat":"q-bio.GN","authors_text":"Hyomin Kim, Insu Han, Jihwan Shin, Jonghoon Lee, Sungsoo Ahn, Taewon Kim, Won-Chul Lee, Youngmok Jung","submitted_at":"2026-01-06T13:46:42Z","abstract_excerpt":"DNA language models are increasingly used to represent genomic sequence, yet their effectiveness depends critically on how raw nucleotides are converted into model inputs. Unlike natural language, DNA offers no canonical boundaries, making fixed tokenizations a brittle design choice under shifts, indels, and local repeats. We introduce DNAChunker, a masked DNA language model that incorporates a learnable adaptive segmentation module to produce context-dependent, variable-length units. Building on a dynamic segmentation procedure, DNAChunker learns to allocate finer granularity to functionally "},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2601.03019","kind":"arxiv","version":4},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2601.03019/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2601.03019","created_at":"2026-05-21T01:05:14.334648+00:00"},{"alias_kind":"arxiv_version","alias_value":"2601.03019v4","created_at":"2026-05-21T01:05:14.334648+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2601.03019","created_at":"2026-05-21T01:05:14.334648+00:00"},{"alias_kind":"pith_short_12","alias_value":"R7QK6ECMLIEG","created_at":"2026-05-21T01:05:14.334648+00:00"},{"alias_kind":"pith_short_16","alias_value":"R7QK6ECMLIEG56GV","created_at":"2026-05-21T01:05:14.334648+00:00"},{"alias_kind":"pith_short_8","alias_value":"R7QK6ECM","created_at":"2026-05-21T01:05:14.334648+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/R7QK6ECMLIEG56GVPQGZMAFZZN","json":"https://pith.science/pith/R7QK6ECMLIEG56GVPQGZMAFZZN.json","graph_json":"https://pith.science/api/pith-number/R7QK6ECMLIEG56GVPQGZMAFZZN/graph.json","events_json":"https://pith.science/api/pith-number/R7QK6ECMLIEG56GVPQGZMAFZZN/events.json","paper":"https://pith.science/paper/R7QK6ECM"},"agent_actions":{"view_html":"https://pith.science/pith/R7QK6ECMLIEG56GVPQGZMAFZZN","download_json":"https://pith.science/pith/R7QK6ECMLIEG56GVPQGZMAFZZN.json","view_paper":"https://pith.science/paper/R7QK6ECM","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2601.03019&json=true","fetch_graph":"https://pith.science/api/pith-number/R7QK6ECMLIEG56GVPQGZMAFZZN/graph.json","fetch_events":"https://pith.science/api/pith-number/R7QK6ECMLIEG56GVPQGZMAFZZN/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/R7QK6ECMLIEG56GVPQGZMAFZZN/action/timestamp_anchor","attest_storage":"https://pith.science/pith/R7QK6ECMLIEG56GVPQGZMAFZZN/action/storage_attestation","attest_author":"https://pith.science/pith/R7QK6ECMLIEG56GVPQGZMAFZZN/action/author_attestation","sign_citation":"https://pith.science/pith/R7QK6ECMLIEG56GVPQGZMAFZZN/action/citation_signature","submit_replication":"https://pith.science/pith/R7QK6ECMLIEG56GVPQGZMAFZZN/action/replication_record"}},"created_at":"2026-05-21T01:05:14.334648+00:00","updated_at":"2026-05-21T01:05:14.334648+00:00"}