{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:2GZ46PQKT24RSRQKHHXGKOD5Q6","short_pith_number":"pith:2GZ46PQK","schema_version":"1.0","canonical_sha256":"d1b3cf3e0a9eb919460a39ee65387d87be312198fa254731ef48933d4d57240f","source":{"kind":"arxiv","id":"2606.19626","version":1},"attestation_state":"computed","paper":{"title":"Toten: Knowledge-Based Ontological Tokenization Of Physical Quantities And Technical Notation In Brazilian Portuguese","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.CL"],"primary_cat":"cs.AI","authors_text":"Antonio de Sousa Leit\\~ao Filho; Allan Kardec Duailibe Barros Filho; Fabr\\'icio Saul Lima; Selby Mykael Lima dos Santos; Rejani Bandeira Vieira Sousa","submitted_at":"2026-06-17T22:06:41Z","abstract_excerpt":"Byte-Pair Encoding tokenization is statistically efficient for vocabulary compression, but semantically blind to structured technical entities, fragmenting physical quantities, numbers, units, and symbolic expressions into lexically arbitrary subwords. We present TOTEN, a knowledge-based ontological tokenization framework that replaces statistical derivation with declarative classification grounded in a formal ontology of engineering entities (OEE). We formalize TOTEN as the triple <O, classify, {inst_tau}>: the ontology gathers types, structural principles, composition relations, and preserva"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.19626","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2026-06-17T22:06:41Z","cross_cats_sorted":["cs.CL"],"title_canon_sha256":"6b2faf4f8d85e89964501a3eecd8e11228496ef2b882190b2b0dc9f9409435b4","abstract_canon_sha256":"6e310cb8fc44cc6a5e53bb187b37681223305daa72ba727fb20524a90f524e0e"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-19T16:12:30.648808Z","signature_b64":"F8dEPAbIR9V+U6DyS/7hx76lhpAZCavotg2xWDQ7Keg0HQrNGpNlqIf3dDKzXFdhxoO3LV3c5LGFwMGLR8XoBA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"d1b3cf3e0a9eb919460a39ee65387d87be312198fa254731ef48933d4d57240f","last_reissued_at":"2026-06-19T16:12:30.648424Z","signature_status":"signed_v1","first_computed_at":"2026-06-19T16:12:30.648424Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Toten: Knowledge-Based Ontological Tokenization Of Physical Quantities And Technical Notation In Brazilian Portuguese","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.CL"],"primary_cat":"cs.AI","authors_text":"Antonio de Sousa Leit\\~ao Filho; Allan Kardec Duailibe Barros Filho; Fabr\\'icio Saul Lima; Selby Mykael Lima dos Santos; Rejani Bandeira Vieira Sousa","submitted_at":"2026-06-17T22:06:41Z","abstract_excerpt":"Byte-Pair Encoding tokenization is statistically efficient for vocabulary compression, but semantically blind to structured technical entities, fragmenting physical quantities, numbers, units, and symbolic expressions into lexically arbitrary subwords. We present TOTEN, a knowledge-based ontological tokenization framework that replaces statistical derivation with declarative classification grounded in a formal ontology of engineering entities (OEE). We formalize TOTEN as the triple <O, classify, {inst_tau}>: the ontology gathers types, structural principles, composition relations, and preserva"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.19626","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.19626/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.19626","created_at":"2026-06-19T16:12:30.648489+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.19626v1","created_at":"2026-06-19T16:12:30.648489+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.19626","created_at":"2026-06-19T16:12:30.648489+00:00"},{"alias_kind":"pith_short_12","alias_value":"2GZ46PQKT24R","created_at":"2026-06-19T16:12:30.648489+00:00"},{"alias_kind":"pith_short_16","alias_value":"2GZ46PQKT24RSRQK","created_at":"2026-06-19T16:12:30.648489+00:00"},{"alias_kind":"pith_short_8","alias_value":"2GZ46PQK","created_at":"2026-06-19T16:12:30.648489+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/2GZ46PQKT24RSRQKHHXGKOD5Q6","json":"https://pith.science/pith/2GZ46PQKT24RSRQKHHXGKOD5Q6.json","graph_json":"https://pith.science/api/pith-number/2GZ46PQKT24RSRQKHHXGKOD5Q6/graph.json","events_json":"https://pith.science/api/pith-number/2GZ46PQKT24RSRQKHHXGKOD5Q6/events.json","paper":"https://pith.science/paper/2GZ46PQK"},"agent_actions":{"view_html":"https://pith.science/pith/2GZ46PQKT24RSRQKHHXGKOD5Q6","download_json":"https://pith.science/pith/2GZ46PQKT24RSRQKHHXGKOD5Q6.json","view_paper":"https://pith.science/paper/2GZ46PQK","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.19626&json=true","fetch_graph":"https://pith.science/api/pith-number/2GZ46PQKT24RSRQKHHXGKOD5Q6/graph.json","fetch_events":"https://pith.science/api/pith-number/2GZ46PQKT24RSRQKHHXGKOD5Q6/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/2GZ46PQKT24RSRQKHHXGKOD5Q6/action/timestamp_anchor","attest_storage":"https://pith.science/pith/2GZ46PQKT24RSRQKHHXGKOD5Q6/action/storage_attestation","attest_author":"https://pith.science/pith/2GZ46PQKT24RSRQKHHXGKOD5Q6/action/author_attestation","sign_citation":"https://pith.science/pith/2GZ46PQKT24RSRQKHHXGKOD5Q6/action/citation_signature","submit_replication":"https://pith.science/pith/2GZ46PQKT24RSRQKHHXGKOD5Q6/action/replication_record"}},"created_at":"2026-06-19T16:12:30.648489+00:00","updated_at":"2026-06-19T16:12:30.648489+00:00"}