{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:YS4AO72QXJS654NAJABZW3TQNX","short_pith_number":"pith:YS4AO72Q","schema_version":"1.0","canonical_sha256":"c4b8077f50ba65eef1a048039b6e706df90eb5882285b3154f35c43028a90ad8","source":{"kind":"arxiv","id":"2602.23353","version":2},"attestation_state":"computed","paper":{"title":"SOTAlign: Semi-Supervised Alignment of Unimodal Vision and Language Models via Optimal Transport","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.LG","authors_text":"Paul Krzakala, Quentin Bouniot, Simon Roschmann, Sonia Mazelet, Zeynep Akata","submitted_at":"2026-02-26T18:55:06Z","abstract_excerpt":"The Platonic Representation Hypothesis posits that neural networks trained on different modalities converge toward a shared statistical model of the world. Recent work exploits this convergence by aligning frozen pretrained vision and language models with lightweight alignment layers, but typically relies on contrastive losses and millions of paired samples. In this work, we ask whether meaningful alignment can be achieved with substantially less supervision. We introduce a semi-supervised setting in which pretrained unimodal encoders are aligned using a small number of image-text pairs togeth"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2602.23353","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-02-26T18:55:06Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"fe0d8b0ec9918fb1ea40c2f823afabf96b2aeb5939e0dd8b277b55bb3cc0cdd9","abstract_canon_sha256":"a360e0eebbe18b2d153fb1dd09688a069bf854b444705d4c700b455acd5d5b03"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-30T02:18:07.210325Z","signature_b64":"ggl1Q5zGAzRy69xPyT6//0SghB3Ljs0rHr8DquLu8sRMAIhUShrCP90GmoWTHyLNPgZqQx/3lKGjrZu4+0oyDA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"c4b8077f50ba65eef1a048039b6e706df90eb5882285b3154f35c43028a90ad8","last_reissued_at":"2026-06-30T02:18:07.209767Z","signature_status":"signed_v1","first_computed_at":"2026-06-30T02:18:07.209767Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"SOTAlign: Semi-Supervised Alignment of Unimodal Vision and Language Models via Optimal Transport","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.LG","authors_text":"Paul Krzakala, Quentin Bouniot, Simon Roschmann, Sonia Mazelet, Zeynep Akata","submitted_at":"2026-02-26T18:55:06Z","abstract_excerpt":"The Platonic Representation Hypothesis posits that neural networks trained on different modalities converge toward a shared statistical model of the world. Recent work exploits this convergence by aligning frozen pretrained vision and language models with lightweight alignment layers, but typically relies on contrastive losses and millions of paired samples. In this work, we ask whether meaningful alignment can be achieved with substantially less supervision. We introduce a semi-supervised setting in which pretrained unimodal encoders are aligned using a small number of image-text pairs togeth"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2602.23353","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2602.23353/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2602.23353","created_at":"2026-06-30T02:18:07.209832+00:00"},{"alias_kind":"arxiv_version","alias_value":"2602.23353v2","created_at":"2026-06-30T02:18:07.209832+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2602.23353","created_at":"2026-06-30T02:18:07.209832+00:00"},{"alias_kind":"pith_short_12","alias_value":"YS4AO72QXJS6","created_at":"2026-06-30T02:18:07.209832+00:00"},{"alias_kind":"pith_short_16","alias_value":"YS4AO72QXJS654NA","created_at":"2026-06-30T02:18:07.209832+00:00"},{"alias_kind":"pith_short_8","alias_value":"YS4AO72Q","created_at":"2026-06-30T02:18:07.209832+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":2,"internal_anchor_count":2,"sample":[{"citing_arxiv_id":"2605.19752","citing_title":"MSAlign: Aligning Molecule and Mass Spectra Foundation Models for Metabolite Identification","ref_index":55,"is_internal_anchor":true},{"citing_arxiv_id":"2604.26370","citing_title":"Topology-Aware Representation Alignment for Semi-Supervised Vision-Language Learning","ref_index":21,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/YS4AO72QXJS654NAJABZW3TQNX","json":"https://pith.science/pith/YS4AO72QXJS654NAJABZW3TQNX.json","graph_json":"https://pith.science/api/pith-number/YS4AO72QXJS654NAJABZW3TQNX/graph.json","events_json":"https://pith.science/api/pith-number/YS4AO72QXJS654NAJABZW3TQNX/events.json","paper":"https://pith.science/paper/YS4AO72Q"},"agent_actions":{"view_html":"https://pith.science/pith/YS4AO72QXJS654NAJABZW3TQNX","download_json":"https://pith.science/pith/YS4AO72QXJS654NAJABZW3TQNX.json","view_paper":"https://pith.science/paper/YS4AO72Q","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2602.23353&json=true","fetch_graph":"https://pith.science/api/pith-number/YS4AO72QXJS654NAJABZW3TQNX/graph.json","fetch_events":"https://pith.science/api/pith-number/YS4AO72QXJS654NAJABZW3TQNX/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/YS4AO72QXJS654NAJABZW3TQNX/action/timestamp_anchor","attest_storage":"https://pith.science/pith/YS4AO72QXJS654NAJABZW3TQNX/action/storage_attestation","attest_author":"https://pith.science/pith/YS4AO72QXJS654NAJABZW3TQNX/action/author_attestation","sign_citation":"https://pith.science/pith/YS4AO72QXJS654NAJABZW3TQNX/action/citation_signature","submit_replication":"https://pith.science/pith/YS4AO72QXJS654NAJABZW3TQNX/action/replication_record"}},"created_at":"2026-06-30T02:18:07.209832+00:00","updated_at":"2026-06-30T02:18:07.209832+00:00"}