{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:LXPUFYFTMCWJADFKRYQS623XWM","short_pith_number":"pith:LXPUFYFT","schema_version":"1.0","canonical_sha256":"5ddf42e0b360ac900caa8e212f6b77b33cd7bb5477d481a2486b48049ace0e88","source":{"kind":"arxiv","id":"2605.25263","version":1},"attestation_state":"computed","paper":{"title":"Mimir: Large-scale Multilingual Concept Modeling","license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.CL","authors_text":"Elio Musacchio, Lucia Siciliani, Pierpaolo Basile","submitted_at":"2026-05-24T21:26:47Z","abstract_excerpt":"Current language modeling approaches are built around tokens. Text corpora are split into tokens, and models are trained by performing computations on these tokens, such as predicting the next token given the preceding ones as context. This paradigm has become the standard in modern language modeling, especially given the outstanding performance obtained by token-based architectures. However, recent works have not only begun to question how language models process and understand meaning from tokens, but also to question whether using higher levels of granularity could advance the research fiel"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2605.25263","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-24T21:26:47Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"aa4d332fc81f54b45fb45ea73534b8a1b1eb669ae652d9555676a96148d27bba","abstract_canon_sha256":"42ef6429633140fe7518f8030674c2f06ca40ec90513e2702079d8563351c853"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-26T02:04:26.147532Z","signature_b64":"WZfw20YMW9AbRbwa6shuOxZDtnSFbuKN3rhkq8mgmE3WCA+fAEqoV8mRlIHnBuzD+2nIs3SNfoFPdLyXnl1oAQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"5ddf42e0b360ac900caa8e212f6b77b33cd7bb5477d481a2486b48049ace0e88","last_reissued_at":"2026-05-26T02:04:26.146791Z","signature_status":"signed_v1","first_computed_at":"2026-05-26T02:04:26.146791Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Mimir: Large-scale Multilingual Concept Modeling","license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.CL","authors_text":"Elio Musacchio, Lucia Siciliani, Pierpaolo Basile","submitted_at":"2026-05-24T21:26:47Z","abstract_excerpt":"Current language modeling approaches are built around tokens. Text corpora are split into tokens, and models are trained by performing computations on these tokens, such as predicting the next token given the preceding ones as context. This paradigm has become the standard in modern language modeling, especially given the outstanding performance obtained by token-based architectures. However, recent works have not only begun to question how language models process and understand meaning from tokens, but also to question whether using higher levels of granularity could advance the research fiel"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.25263","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.25263/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2605.25263","created_at":"2026-05-26T02:04:26.146909+00:00"},{"alias_kind":"arxiv_version","alias_value":"2605.25263v1","created_at":"2026-05-26T02:04:26.146909+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.25263","created_at":"2026-05-26T02:04:26.146909+00:00"},{"alias_kind":"pith_short_12","alias_value":"LXPUFYFTMCWJ","created_at":"2026-05-26T02:04:26.146909+00:00"},{"alias_kind":"pith_short_16","alias_value":"LXPUFYFTMCWJADFK","created_at":"2026-05-26T02:04:26.146909+00:00"},{"alias_kind":"pith_short_8","alias_value":"LXPUFYFT","created_at":"2026-05-26T02:04:26.146909+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/LXPUFYFTMCWJADFKRYQS623XWM","json":"https://pith.science/pith/LXPUFYFTMCWJADFKRYQS623XWM.json","graph_json":"https://pith.science/api/pith-number/LXPUFYFTMCWJADFKRYQS623XWM/graph.json","events_json":"https://pith.science/api/pith-number/LXPUFYFTMCWJADFKRYQS623XWM/events.json","paper":"https://pith.science/paper/LXPUFYFT"},"agent_actions":{"view_html":"https://pith.science/pith/LXPUFYFTMCWJADFKRYQS623XWM","download_json":"https://pith.science/pith/LXPUFYFTMCWJADFKRYQS623XWM.json","view_paper":"https://pith.science/paper/LXPUFYFT","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2605.25263&json=true","fetch_graph":"https://pith.science/api/pith-number/LXPUFYFTMCWJADFKRYQS623XWM/graph.json","fetch_events":"https://pith.science/api/pith-number/LXPUFYFTMCWJADFKRYQS623XWM/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/LXPUFYFTMCWJADFKRYQS623XWM/action/timestamp_anchor","attest_storage":"https://pith.science/pith/LXPUFYFTMCWJADFKRYQS623XWM/action/storage_attestation","attest_author":"https://pith.science/pith/LXPUFYFTMCWJADFKRYQS623XWM/action/author_attestation","sign_citation":"https://pith.science/pith/LXPUFYFTMCWJADFKRYQS623XWM/action/citation_signature","submit_replication":"https://pith.science/pith/LXPUFYFTMCWJADFKRYQS623XWM/action/replication_record"}},"created_at":"2026-05-26T02:04:26.146909+00:00","updated_at":"2026-05-26T02:04:26.146909+00:00"}