{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:NQUAIQAFLUPCQ26TTT2VNWOCWX","short_pith_number":"pith:NQUAIQAF","schema_version":"1.0","canonical_sha256":"6c280440055d1e286bd39cf556d9c2b5db6621e7680d6ce35930cc8d8523f6a8","source":{"kind":"arxiv","id":"2605.17379","version":1},"attestation_state":"computed","paper":{"title":"Learning Faster with Better Tokens: Parameter-Efficient Vocabulary Adaptation for Specialized Text Summarization","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.CL","authors_text":"Gunjan Balde, Mainack Mondal, Niloy Ganguly, Soumyadeep Roy","submitted_at":"2026-05-17T10:45:01Z","abstract_excerpt":"Large language models pretrained on general-domain corpora often exhibit tokenization inefficiencies when applied to specialized domains. Although continual pretraining for domain adaptation partially alleviate performance degradation, it does not resolve the fundamental vocabulary mismatch. To address this gap, we introduce a targeted parameter-efficient domain adaptation approach that combines vocabulary adaptation with pretraining for LLM-based text summarization. Our unified framework augments pretrained tokenizers with domain-specific tokens while selectively replacing under-trained and u"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2605.17379","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2026-05-17T10:45:01Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"50fa848f2f1c4de9deb70bf5df073e3722fc7bdfbb83c46bc7dd5416e0defe12","abstract_canon_sha256":"fe8b12425a7e6180efd3d9e192be888370064fc73d8cd810b7d7982913ff7b15"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T00:03:55.583282Z","signature_b64":"8TOzsmQDsJ4GMi4HNih/JIm9Ca1szCMdA/xXVwrxHvek8y9DKOeWwShu8gPIBJYC2m3xbghqpYThfuR7GgSrBg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"6c280440055d1e286bd39cf556d9c2b5db6621e7680d6ce35930cc8d8523f6a8","last_reissued_at":"2026-05-20T00:03:55.582504Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T00:03:55.582504Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Learning Faster with Better Tokens: Parameter-Efficient Vocabulary Adaptation for Specialized Text Summarization","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.CL","authors_text":"Gunjan Balde, Mainack Mondal, Niloy Ganguly, Soumyadeep Roy","submitted_at":"2026-05-17T10:45:01Z","abstract_excerpt":"Large language models pretrained on general-domain corpora often exhibit tokenization inefficiencies when applied to specialized domains. Although continual pretraining for domain adaptation partially alleviate performance degradation, it does not resolve the fundamental vocabulary mismatch. To address this gap, we introduce a targeted parameter-efficient domain adaptation approach that combines vocabulary adaptation with pretraining for LLM-based text summarization. Our unified framework augments pretrained tokenizers with domain-specific tokens while selectively replacing under-trained and u"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.17379","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.17379/integrity.json","findings":[],"available":true,"detectors_run":[{"name":"claim_evidence","ran_at":"2026-05-19T21:41:57.771398Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"ai_meta_artifact","ran_at":"2026-05-19T21:33:23.709333Z","status":"skipped","version":"1.0.0","findings_count":0}],"snapshot_sha256":"f2aca72447b7df6b4a5d590a02f5afd46778ec987198531ff21062006b485194"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2605.17379","created_at":"2026-05-20T00:03:55.582618+00:00"},{"alias_kind":"arxiv_version","alias_value":"2605.17379v1","created_at":"2026-05-20T00:03:55.582618+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.17379","created_at":"2026-05-20T00:03:55.582618+00:00"},{"alias_kind":"pith_short_12","alias_value":"NQUAIQAFLUPC","created_at":"2026-05-20T00:03:55.582618+00:00"},{"alias_kind":"pith_short_16","alias_value":"NQUAIQAFLUPCQ26T","created_at":"2026-05-20T00:03:55.582618+00:00"},{"alias_kind":"pith_short_8","alias_value":"NQUAIQAF","created_at":"2026-05-20T00:03:55.582618+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/NQUAIQAFLUPCQ26TTT2VNWOCWX","json":"https://pith.science/pith/NQUAIQAFLUPCQ26TTT2VNWOCWX.json","graph_json":"https://pith.science/api/pith-number/NQUAIQAFLUPCQ26TTT2VNWOCWX/graph.json","events_json":"https://pith.science/api/pith-number/NQUAIQAFLUPCQ26TTT2VNWOCWX/events.json","paper":"https://pith.science/paper/NQUAIQAF"},"agent_actions":{"view_html":"https://pith.science/pith/NQUAIQAFLUPCQ26TTT2VNWOCWX","download_json":"https://pith.science/pith/NQUAIQAFLUPCQ26TTT2VNWOCWX.json","view_paper":"https://pith.science/paper/NQUAIQAF","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2605.17379&json=true","fetch_graph":"https://pith.science/api/pith-number/NQUAIQAFLUPCQ26TTT2VNWOCWX/graph.json","fetch_events":"https://pith.science/api/pith-number/NQUAIQAFLUPCQ26TTT2VNWOCWX/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/NQUAIQAFLUPCQ26TTT2VNWOCWX/action/timestamp_anchor","attest_storage":"https://pith.science/pith/NQUAIQAFLUPCQ26TTT2VNWOCWX/action/storage_attestation","attest_author":"https://pith.science/pith/NQUAIQAFLUPCQ26TTT2VNWOCWX/action/author_attestation","sign_citation":"https://pith.science/pith/NQUAIQAFLUPCQ26TTT2VNWOCWX/action/citation_signature","submit_replication":"https://pith.science/pith/NQUAIQAFLUPCQ26TTT2VNWOCWX/action/replication_record"}},"created_at":"2026-05-20T00:03:55.582618+00:00","updated_at":"2026-05-20T00:03:55.582618+00:00"}