{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:AWMSQSXDDX6SNWCV64B6F4QOQS","short_pith_number":"pith:AWMSQSXD","canonical_record":{"source":{"id":"2605.14427","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-14T06:19:42Z","cross_cats_sorted":["cs.SD"],"title_canon_sha256":"02173003b7a20c7ba5fa0b8df3d30825fb729eb4ed6a974c3aa1421b70248b53","abstract_canon_sha256":"5bf7faaf0e93efbb9506f9eaf5ad242fe9961b64fed10b773131c4a36cf96330"},"schema_version":"1.0"},"canonical_sha256":"0599284ae31dfd26d855f703e2f20e8483234d8af7c4c39ed2544458fe2615f5","source":{"kind":"arxiv","id":"2605.14427","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.14427","created_at":"2026-05-17T23:39:07Z"},{"alias_kind":"arxiv_version","alias_value":"2605.14427v1","created_at":"2026-05-17T23:39:07Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.14427","created_at":"2026-05-17T23:39:07Z"},{"alias_kind":"pith_short_12","alias_value":"AWMSQSXDDX6S","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"AWMSQSXDDX6SNWCV","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"AWMSQSXD","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:AWMSQSXDDX6SNWCV64B6F4QOQS","target":"record","payload":{"canonical_record":{"source":{"id":"2605.14427","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-14T06:19:42Z","cross_cats_sorted":["cs.SD"],"title_canon_sha256":"02173003b7a20c7ba5fa0b8df3d30825fb729eb4ed6a974c3aa1421b70248b53","abstract_canon_sha256":"5bf7faaf0e93efbb9506f9eaf5ad242fe9961b64fed10b773131c4a36cf96330"},"schema_version":"1.0"},"canonical_sha256":"0599284ae31dfd26d855f703e2f20e8483234d8af7c4c39ed2544458fe2615f5","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:39:07.178756Z","signature_b64":"5oFfaNwypG4NNqSF/pwxA8FUyEtGWlZnT0I03ux0feECD3PubuE5NflKhHFPEXTqduDFkBw0HIviXSslvtf9Bg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"0599284ae31dfd26d855f703e2f20e8483234d8af7c4c39ed2544458fe2615f5","last_reissued_at":"2026-05-17T23:39:07.178032Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:39:07.178032Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2605.14427","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:39:07Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"gAJ1bSpPGeQmAybmluyoDeyiwCw3WGgzezYYPoAgZpRSu/LZ5uVpvP9UgsjOMk//MrJUt9xTDZcsCBXPXpWHDg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-22T02:52:54.497414Z"},"content_sha256":"6432e8aae3234b94b842532518107240f92d935b94f56e367ea28bde3c08fa58","schema_version":"1.0","event_id":"sha256:6432e8aae3234b94b842532518107240f92d935b94f56e367ea28bde3c08fa58"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:AWMSQSXDDX6SNWCV64B6F4QOQS","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"A Calculus-Based Framework for Determining Vocabulary Size in End-to-End ASR","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Calculus locates the optimal vocabulary size for end-to-end ASR by fitting a cost curve and applying derivative tests.","cross_cats":["cs.SD"],"primary_cat":"cs.CL","authors_text":"Sunil Kumar Kopparapu","submitted_at":"2026-05-14T06:19:42Z","abstract_excerpt":"In hybrid automatic speech recognition (ASR) systems, the vocabulary size is unambiguous, typically determined by the number of phones, bi-phones, or tri-phones present in the language. In contrast, end-to-end ASR systems derive their vocabulary, often referred to as tokens from the text corpus used for training. The choice and, more importantly, the size of this vocabulary is a critical hyper-parameter in training end-to-end ASR systems. Tokenization algorithms such as Byte Pair Encoding (BPE), WordPiece, and Unigram Language Model (ULM) use the vocabulary size as an input hyper-parameter to "},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"We demonstrate the utility and usefulness of our approach by applying it on a standard Librispeech corpus and show that the optimal choice of vocabulary size hyper-parameter improves the performance of the ASR.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That a curve fitted to the cost function data will have a differentiable minimum identifiable by first and second derivative tests, and that this mathematical optimum will produce measurably better ASR performance on held-out data.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Curve fitting and calculus derivative tests on a tokenization cost function identify an optimal vocabulary size that improves end-to-end ASR performance on Librispeech.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Calculus locates the optimal vocabulary size for end-to-end ASR by fitting a cost curve and applying derivative tests.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"0263389b0e2f1f4f5a9ce1d1aa57a6db9bc901f89cc3a1ef46fe4dfa89981426"},"source":{"id":"2605.14427","kind":"arxiv","version":1},"verdict":{"id":"26ff0e22-fde7-496a-9abf-2f629b64ff3a","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T02:04:22.903529Z","strongest_claim":"We demonstrate the utility and usefulness of our approach by applying it on a standard Librispeech corpus and show that the optimal choice of vocabulary size hyper-parameter improves the performance of the ASR.","one_line_summary":"Curve fitting and calculus derivative tests on a tokenization cost function identify an optimal vocabulary size that improves end-to-end ASR performance on Librispeech.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That a curve fitted to the cost function data will have a differentiable minimum identifiable by first and second derivative tests, and that this mathematical optimum will produce measurably better ASR performance on held-out data.","pith_extraction_headline":"Calculus locates the optimal vocabulary size for end-to-end ASR by fitting a cost curve and applying derivative tests."},"references":{"count":13,"sample":[{"doi":"","year":2024,"title":"A cost minimization approach to fix the vocabulary size in a tokenizer for an end-to-end ASR system,","work_id":"295ba0f9-2df3-4fd3-84ba-1b1b0387f424","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2015,"title":"Librispeech ASR corpus: train-clean-100,","work_id":"b03979c8-8ea1-4017-b0f8-cc645a16dc58","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"D. C. Montgomery, E. A. Peck, and G. G. Vining,Introduction to Linear Regression Analysis, 5th ed. Hoboken, NJ: John Wiley & Sons,","work_id":"9e3877dc-f7ff-4fa7-96d4-f209f9c0815e","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"[Online]. Available: https://www.wiley.com/en-us/Introduction+ to+Linear+Regression+Analysis,+5th+Edition-p-9781119578727","work_id":"35b01cf2-232b-4065-bef3-9f43e92c481e","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2018,"title":"ESPnet: End-to-End Speech Processing Toolkit,","work_id":"92255ea1-947e-4e96-893b-e43fea4ae518","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":13,"snapshot_sha256":"35122c0ddecccc217d28bd53e50eefe054468204824bea12868252fd02889c45","internal_anchors":0},"formal_canon":{"evidence_count":2,"snapshot_sha256":"52f25b64deb19bf9bbee5b9e3f360954a18137e16abd22fb38093c77b6f4bdba"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"26ff0e22-fde7-496a-9abf-2f629b64ff3a"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:39:07Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"NdxThtheRVZouuyPaETD5HRp/k2ya8z3O1GmbX3aMg+i6gyILGaBjJuHUqGa9ahSYl8Z+CYwbRhmqVanVupBBQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-22T02:52:54.498441Z"},"content_sha256":"949cd2071d9d752263bd9f162eeb415099793a4c84f217f3991ccde6ab969c9b","schema_version":"1.0","event_id":"sha256:949cd2071d9d752263bd9f162eeb415099793a4c84f217f3991ccde6ab969c9b"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/AWMSQSXDDX6SNWCV64B6F4QOQS/bundle.json","state_url":"https://pith.science/pith/AWMSQSXDDX6SNWCV64B6F4QOQS/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/AWMSQSXDDX6SNWCV64B6F4QOQS/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-22T02:52:54Z","links":{"resolver":"https://pith.science/pith/AWMSQSXDDX6SNWCV64B6F4QOQS","bundle":"https://pith.science/pith/AWMSQSXDDX6SNWCV64B6F4QOQS/bundle.json","state":"https://pith.science/pith/AWMSQSXDDX6SNWCV64B6F4QOQS/state.json","well_known_bundle":"https://pith.science/.well-known/pith/AWMSQSXDDX6SNWCV64B6F4QOQS/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:AWMSQSXDDX6SNWCV64B6F4QOQS","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"5bf7faaf0e93efbb9506f9eaf5ad242fe9961b64fed10b773131c4a36cf96330","cross_cats_sorted":["cs.SD"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-14T06:19:42Z","title_canon_sha256":"02173003b7a20c7ba5fa0b8df3d30825fb729eb4ed6a974c3aa1421b70248b53"},"schema_version":"1.0","source":{"id":"2605.14427","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.14427","created_at":"2026-05-17T23:39:07Z"},{"alias_kind":"arxiv_version","alias_value":"2605.14427v1","created_at":"2026-05-17T23:39:07Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.14427","created_at":"2026-05-17T23:39:07Z"},{"alias_kind":"pith_short_12","alias_value":"AWMSQSXDDX6S","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"AWMSQSXDDX6SNWCV","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"AWMSQSXD","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:949cd2071d9d752263bd9f162eeb415099793a4c84f217f3991ccde6ab969c9b","target":"graph","created_at":"2026-05-17T23:39:07Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"We demonstrate the utility and usefulness of our approach by applying it on a standard Librispeech corpus and show that the optimal choice of vocabulary size hyper-parameter improves the performance of the ASR."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That a curve fitted to the cost function data will have a differentiable minimum identifiable by first and second derivative tests, and that this mathematical optimum will produce measurably better ASR performance on held-out data."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Curve fitting and calculus derivative tests on a tokenization cost function identify an optimal vocabulary size that improves end-to-end ASR performance on Librispeech."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Calculus locates the optimal vocabulary size for end-to-end ASR by fitting a cost curve and applying derivative tests."}],"snapshot_sha256":"0263389b0e2f1f4f5a9ce1d1aa57a6db9bc901f89cc3a1ef46fe4dfa89981426"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"52f25b64deb19bf9bbee5b9e3f360954a18137e16abd22fb38093c77b6f4bdba"},"paper":{"abstract_excerpt":"In hybrid automatic speech recognition (ASR) systems, the vocabulary size is unambiguous, typically determined by the number of phones, bi-phones, or tri-phones present in the language. In contrast, end-to-end ASR systems derive their vocabulary, often referred to as tokens from the text corpus used for training. The choice and, more importantly, the size of this vocabulary is a critical hyper-parameter in training end-to-end ASR systems. Tokenization algorithms such as Byte Pair Encoding (BPE), WordPiece, and Unigram Language Model (ULM) use the vocabulary size as an input hyper-parameter to ","authors_text":"Sunil Kumar Kopparapu","cross_cats":["cs.SD"],"headline":"Calculus locates the optimal vocabulary size for end-to-end ASR by fitting a cost curve and applying derivative tests.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-14T06:19:42Z","title":"A Calculus-Based Framework for Determining Vocabulary Size in End-to-End ASR"},"references":{"count":13,"internal_anchors":0,"resolved_work":13,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"A cost minimization approach to fix the vocabulary size in a tokenizer for an end-to-end ASR system,","work_id":"295ba0f9-2df3-4fd3-84ba-1b1b0387f424","year":2024},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Librispeech ASR corpus: train-clean-100,","work_id":"b03979c8-8ea1-4017-b0f8-cc645a16dc58","year":2015},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"D. C. Montgomery, E. A. Peck, and G. G. Vining,Introduction to Linear Regression Analysis, 5th ed. Hoboken, NJ: John Wiley & Sons,","work_id":"9e3877dc-f7ff-4fa7-96d4-f209f9c0815e","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"[Online]. Available: https://www.wiley.com/en-us/Introduction+ to+Linear+Regression+Analysis,+5th+Edition-p-9781119578727","work_id":"35b01cf2-232b-4065-bef3-9f43e92c481e","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"ESPnet: End-to-End Speech Processing Toolkit,","work_id":"92255ea1-947e-4e96-893b-e43fea4ae518","year":2018}],"snapshot_sha256":"35122c0ddecccc217d28bd53e50eefe054468204824bea12868252fd02889c45"},"source":{"id":"2605.14427","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-15T02:04:22.903529Z","id":"26ff0e22-fde7-496a-9abf-2f629b64ff3a","model_set":{"reader":"grok-4.3"},"one_line_summary":"Curve fitting and calculus derivative tests on a tokenization cost function identify an optimal vocabulary size that improves end-to-end ASR performance on Librispeech.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Calculus locates the optimal vocabulary size for end-to-end ASR by fitting a cost curve and applying derivative tests.","strongest_claim":"We demonstrate the utility and usefulness of our approach by applying it on a standard Librispeech corpus and show that the optimal choice of vocabulary size hyper-parameter improves the performance of the ASR.","weakest_assumption":"That a curve fitted to the cost function data will have a differentiable minimum identifiable by first and second derivative tests, and that this mathematical optimum will produce measurably better ASR performance on held-out data."}},"verdict_id":"26ff0e22-fde7-496a-9abf-2f629b64ff3a"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:6432e8aae3234b94b842532518107240f92d935b94f56e367ea28bde3c08fa58","target":"record","created_at":"2026-05-17T23:39:07Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"5bf7faaf0e93efbb9506f9eaf5ad242fe9961b64fed10b773131c4a36cf96330","cross_cats_sorted":["cs.SD"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-14T06:19:42Z","title_canon_sha256":"02173003b7a20c7ba5fa0b8df3d30825fb729eb4ed6a974c3aa1421b70248b53"},"schema_version":"1.0","source":{"id":"2605.14427","kind":"arxiv","version":1}},"canonical_sha256":"0599284ae31dfd26d855f703e2f20e8483234d8af7c4c39ed2544458fe2615f5","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"0599284ae31dfd26d855f703e2f20e8483234d8af7c4c39ed2544458fe2615f5","first_computed_at":"2026-05-17T23:39:07.178032Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:39:07.178032Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"5oFfaNwypG4NNqSF/pwxA8FUyEtGWlZnT0I03ux0feECD3PubuE5NflKhHFPEXTqduDFkBw0HIviXSslvtf9Bg==","signature_status":"signed_v1","signed_at":"2026-05-17T23:39:07.178756Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.14427","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:6432e8aae3234b94b842532518107240f92d935b94f56e367ea28bde3c08fa58","sha256:949cd2071d9d752263bd9f162eeb415099793a4c84f217f3991ccde6ab969c9b"],"state_sha256":"cb8ad44fb264031de732e3cbe31690d48a6886d7b4638c5b0fd4fffb6aa97c3b"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"65y9X6KfyNUIYadXhhH8VYPUl8j8O13k2fAjnqGpLdwTp+4hiBXRp+WDoZfXQMmekG5LaGOy+f2nOIeh0ndcCQ==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-22T02:52:54.501736Z","bundle_sha256":"e1a08734d57d11bcce92ee8daac47a1fa8834a9225662b36221286051d6c9383"}}