{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:AWMSQSXDDX6SNWCV64B6F4QOQS","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"5bf7faaf0e93efbb9506f9eaf5ad242fe9961b64fed10b773131c4a36cf96330","cross_cats_sorted":["cs.SD"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-14T06:19:42Z","title_canon_sha256":"02173003b7a20c7ba5fa0b8df3d30825fb729eb4ed6a974c3aa1421b70248b53"},"schema_version":"1.0","source":{"id":"2605.14427","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.14427","created_at":"2026-05-17T23:39:07Z"},{"alias_kind":"arxiv_version","alias_value":"2605.14427v1","created_at":"2026-05-17T23:39:07Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.14427","created_at":"2026-05-17T23:39:07Z"},{"alias_kind":"pith_short_12","alias_value":"AWMSQSXDDX6S","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"AWMSQSXDDX6SNWCV","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"AWMSQSXD","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:949cd2071d9d752263bd9f162eeb415099793a4c84f217f3991ccde6ab969c9b","target":"graph","created_at":"2026-05-17T23:39:07Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"We demonstrate the utility and usefulness of our approach by applying it on a standard Librispeech corpus and show that the optimal choice of vocabulary size hyper-parameter improves the performance of the ASR."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That a curve fitted to the cost function data will have a differentiable minimum identifiable by first and second derivative tests, and that this mathematical optimum will produce measurably better ASR performance on held-out data."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Curve fitting and calculus derivative tests on a tokenization cost function identify an optimal vocabulary size that improves end-to-end ASR performance on Librispeech."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Calculus locates the optimal vocabulary size for end-to-end ASR by fitting a cost curve and applying derivative tests."}],"snapshot_sha256":"0263389b0e2f1f4f5a9ce1d1aa57a6db9bc901f89cc3a1ef46fe4dfa89981426"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"52f25b64deb19bf9bbee5b9e3f360954a18137e16abd22fb38093c77b6f4bdba"},"paper":{"abstract_excerpt":"In hybrid automatic speech recognition (ASR) systems, the vocabulary size is unambiguous, typically determined by the number of phones, bi-phones, or tri-phones present in the language. In contrast, end-to-end ASR systems derive their vocabulary, often referred to as tokens from the text corpus used for training. The choice and, more importantly, the size of this vocabulary is a critical hyper-parameter in training end-to-end ASR systems. Tokenization algorithms such as Byte Pair Encoding (BPE), WordPiece, and Unigram Language Model (ULM) use the vocabulary size as an input hyper-parameter to ","authors_text":"Sunil Kumar Kopparapu","cross_cats":["cs.SD"],"headline":"Calculus locates the optimal vocabulary size for end-to-end ASR by fitting a cost curve and applying derivative tests.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-14T06:19:42Z","title":"A Calculus-Based Framework for Determining Vocabulary Size in End-to-End ASR"},"references":{"count":13,"internal_anchors":0,"resolved_work":13,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"A cost minimization approach to fix the vocabulary size in a tokenizer for an end-to-end ASR system,","work_id":"295ba0f9-2df3-4fd3-84ba-1b1b0387f424","year":2024},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Librispeech ASR corpus: train-clean-100,","work_id":"b03979c8-8ea1-4017-b0f8-cc645a16dc58","year":2015},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"D. C. Montgomery, E. A. Peck, and G. G. Vining,Introduction to Linear Regression Analysis, 5th ed. Hoboken, NJ: John Wiley & Sons,","work_id":"9e3877dc-f7ff-4fa7-96d4-f209f9c0815e","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"[Online]. Available: https://www.wiley.com/en-us/Introduction+ to+Linear+Regression+Analysis,+5th+Edition-p-9781119578727","work_id":"35b01cf2-232b-4065-bef3-9f43e92c481e","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"ESPnet: End-to-End Speech Processing Toolkit,","work_id":"92255ea1-947e-4e96-893b-e43fea4ae518","year":2018}],"snapshot_sha256":"35122c0ddecccc217d28bd53e50eefe054468204824bea12868252fd02889c45"},"source":{"id":"2605.14427","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-15T02:04:22.903529Z","id":"26ff0e22-fde7-496a-9abf-2f629b64ff3a","model_set":{"reader":"grok-4.3"},"one_line_summary":"Curve fitting and calculus derivative tests on a tokenization cost function identify an optimal vocabulary size that improves end-to-end ASR performance on Librispeech.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Calculus locates the optimal vocabulary size for end-to-end ASR by fitting a cost curve and applying derivative tests.","strongest_claim":"We demonstrate the utility and usefulness of our approach by applying it on a standard Librispeech corpus and show that the optimal choice of vocabulary size hyper-parameter improves the performance of the ASR.","weakest_assumption":"That a curve fitted to the cost function data will have a differentiable minimum identifiable by first and second derivative tests, and that this mathematical optimum will produce measurably better ASR performance on held-out data."}},"verdict_id":"26ff0e22-fde7-496a-9abf-2f629b64ff3a"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:6432e8aae3234b94b842532518107240f92d935b94f56e367ea28bde3c08fa58","target":"record","created_at":"2026-05-17T23:39:07Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"5bf7faaf0e93efbb9506f9eaf5ad242fe9961b64fed10b773131c4a36cf96330","cross_cats_sorted":["cs.SD"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-14T06:19:42Z","title_canon_sha256":"02173003b7a20c7ba5fa0b8df3d30825fb729eb4ed6a974c3aa1421b70248b53"},"schema_version":"1.0","source":{"id":"2605.14427","kind":"arxiv","version":1}},"canonical_sha256":"0599284ae31dfd26d855f703e2f20e8483234d8af7c4c39ed2544458fe2615f5","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"0599284ae31dfd26d855f703e2f20e8483234d8af7c4c39ed2544458fe2615f5","first_computed_at":"2026-05-17T23:39:07.178032Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:39:07.178032Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"5oFfaNwypG4NNqSF/pwxA8FUyEtGWlZnT0I03ux0feECD3PubuE5NflKhHFPEXTqduDFkBw0HIviXSslvtf9Bg==","signature_status":"signed_v1","signed_at":"2026-05-17T23:39:07.178756Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.14427","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:6432e8aae3234b94b842532518107240f92d935b94f56e367ea28bde3c08fa58","sha256:949cd2071d9d752263bd9f162eeb415099793a4c84f217f3991ccde6ab969c9b"],"state_sha256":"cb8ad44fb264031de732e3cbe31690d48a6886d7b4638c5b0fd4fffb6aa97c3b"}