{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:PY4CMPEHL5PARIVUSMXKUZB4UL","short_pith_number":"pith:PY4CMPEH","schema_version":"1.0","canonical_sha256":"7e38263c875f5e08a2b4932eaa643ca2e9682c86fdd5af360e10938d520a49de","source":{"kind":"arxiv","id":"2606.01666","version":1},"attestation_state":"computed","paper":{"title":"DOT-MoE: Differentiable Optimal Transport for MoEfication","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.LG","authors_text":"Arnav Chavan, Aryamaan Thakur, Deepak Gupta, Steve Teig, Udbhav Bamba","submitted_at":"2026-06-01T04:19:16Z","abstract_excerpt":"The scaling of Large Language Models (LLMs) has driven significant performance gains but created substantial challenges in inference efficiency. While Mixture of Experts (MoEs) architectures address this by decoupling model size from inference cost, training MoEs from scratch is often unstable and compute intensive. Conversion of pre-trained dense models into sparse MoEs has emerged as an alternative solution; however, existing methods typically rely on heuristic neuron clustering or random splitting to partition the Feed-Forward Network (FFN) into experts. In this work, we propose DOT-MoE, a "},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.01666","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-06-01T04:19:16Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"208e356267e6af967bcd896b2a16d0ed752602445ce28879f7c7b48a51940c1c","abstract_canon_sha256":"0ab7f74d026b87e1e0575392621c3753a7b59b04c56b0fa92f8627db1d2bd5e0"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-02T02:04:39.597367Z","signature_b64":"gpuOaXeKLgWd4MrgatdpCLFfptmxoaIiYLHGGYSPp/A1HcE+7xKKZrt800z+Lr/DeAKuWhXypmTr9XdgaoL0AA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"7e38263c875f5e08a2b4932eaa643ca2e9682c86fdd5af360e10938d520a49de","last_reissued_at":"2026-06-02T02:04:39.597023Z","signature_status":"signed_v1","first_computed_at":"2026-06-02T02:04:39.597023Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"DOT-MoE: Differentiable Optimal Transport for MoEfication","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.LG","authors_text":"Arnav Chavan, Aryamaan Thakur, Deepak Gupta, Steve Teig, Udbhav Bamba","submitted_at":"2026-06-01T04:19:16Z","abstract_excerpt":"The scaling of Large Language Models (LLMs) has driven significant performance gains but created substantial challenges in inference efficiency. While Mixture of Experts (MoEs) architectures address this by decoupling model size from inference cost, training MoEs from scratch is often unstable and compute intensive. Conversion of pre-trained dense models into sparse MoEs has emerged as an alternative solution; however, existing methods typically rely on heuristic neuron clustering or random splitting to partition the Feed-Forward Network (FFN) into experts. In this work, we propose DOT-MoE, a "},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.01666","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.01666/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.01666","created_at":"2026-06-02T02:04:39.597075+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.01666v1","created_at":"2026-06-02T02:04:39.597075+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.01666","created_at":"2026-06-02T02:04:39.597075+00:00"},{"alias_kind":"pith_short_12","alias_value":"PY4CMPEHL5PA","created_at":"2026-06-02T02:04:39.597075+00:00"},{"alias_kind":"pith_short_16","alias_value":"PY4CMPEHL5PARIVU","created_at":"2026-06-02T02:04:39.597075+00:00"},{"alias_kind":"pith_short_8","alias_value":"PY4CMPEH","created_at":"2026-06-02T02:04:39.597075+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/PY4CMPEHL5PARIVUSMXKUZB4UL","json":"https://pith.science/pith/PY4CMPEHL5PARIVUSMXKUZB4UL.json","graph_json":"https://pith.science/api/pith-number/PY4CMPEHL5PARIVUSMXKUZB4UL/graph.json","events_json":"https://pith.science/api/pith-number/PY4CMPEHL5PARIVUSMXKUZB4UL/events.json","paper":"https://pith.science/paper/PY4CMPEH"},"agent_actions":{"view_html":"https://pith.science/pith/PY4CMPEHL5PARIVUSMXKUZB4UL","download_json":"https://pith.science/pith/PY4CMPEHL5PARIVUSMXKUZB4UL.json","view_paper":"https://pith.science/paper/PY4CMPEH","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.01666&json=true","fetch_graph":"https://pith.science/api/pith-number/PY4CMPEHL5PARIVUSMXKUZB4UL/graph.json","fetch_events":"https://pith.science/api/pith-number/PY4CMPEHL5PARIVUSMXKUZB4UL/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/PY4CMPEHL5PARIVUSMXKUZB4UL/action/timestamp_anchor","attest_storage":"https://pith.science/pith/PY4CMPEHL5PARIVUSMXKUZB4UL/action/storage_attestation","attest_author":"https://pith.science/pith/PY4CMPEHL5PARIVUSMXKUZB4UL/action/author_attestation","sign_citation":"https://pith.science/pith/PY4CMPEHL5PARIVUSMXKUZB4UL/action/citation_signature","submit_replication":"https://pith.science/pith/PY4CMPEHL5PARIVUSMXKUZB4UL/action/replication_record"}},"created_at":"2026-06-02T02:04:39.597075+00:00","updated_at":"2026-06-02T02:04:39.597075+00:00"}