{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2022:AQQT67GG4YZLIOL7PRVW2GAHME","short_pith_number":"pith:AQQT67GG","schema_version":"1.0","canonical_sha256":"04213f7cc6e632b4397f7c6b6d1807611457e552036dbd3efb8a231d78f6afee","source":{"kind":"arxiv","id":"2205.01818","version":2},"attestation_state":"computed","paper":{"title":"i-Code: An Integrative and Composable Multimodal Learning Framework","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","cs.CL","cs.CV","eess.AS"],"primary_cat":"cs.LG","authors_text":"Bin Xiao, Chenguang Zhu, Dongdong Chen, Liyang Lu, Lu Yuan, Mei Gao, Michael Zeng, Naoyuki Kanda, Noel Codella, Reid Pryzant, Robert Gmyr, Takuya Yoshioka, Xuedong Huang, Yao Qian, Yichong Xu, Yi-Ling Chen, Yujia Xie, Yu Shi, Yuwei Fang, Ziyi Yang","submitted_at":"2022-05-03T23:38:50Z","abstract_excerpt":"Human intelligence is multimodal; we integrate visual, linguistic, and acoustic signals to maintain a holistic worldview. Most current pretraining methods, however, are limited to one or two modalities. We present i-Code, a self-supervised pretraining framework where users may flexibly combine the modalities of vision, speech, and language into unified and general-purpose vector representations. In this framework, data from each modality are first given to pretrained single-modality encoders. The encoder outputs are then integrated with a multimodal fusion network, which uses novel attention m"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2205.01818","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2022-05-03T23:38:50Z","cross_cats_sorted":["cs.AI","cs.CL","cs.CV","eess.AS"],"title_canon_sha256":"772ef49dc5afd5ab77635e52e19fab794754c5e9a24f9106487f9307345fbb2f","abstract_canon_sha256":"021874672ade548cff8829d5cf0c1807b3882eea29e495cd7149bf0aafb5e446"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-07-05T04:20:35.107231Z","signature_b64":"Ozd8czRSNSElB1SGNj/GL+1Ii6poXnGe5yRfyYhoy7mH4KQTP3XgbyKqvhevy4tHUOTwUhwkuuNEz8oyDxT/Dw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"04213f7cc6e632b4397f7c6b6d1807611457e552036dbd3efb8a231d78f6afee","last_reissued_at":"2026-07-05T04:20:35.106766Z","signature_status":"signed_v1","first_computed_at":"2026-07-05T04:20:35.106766Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"i-Code: An Integrative and Composable Multimodal Learning Framework","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","cs.CL","cs.CV","eess.AS"],"primary_cat":"cs.LG","authors_text":"Bin Xiao, Chenguang Zhu, Dongdong Chen, Liyang Lu, Lu Yuan, Mei Gao, Michael Zeng, Naoyuki Kanda, Noel Codella, Reid Pryzant, Robert Gmyr, Takuya Yoshioka, Xuedong Huang, Yao Qian, Yichong Xu, Yi-Ling Chen, Yujia Xie, Yu Shi, Yuwei Fang, Ziyi Yang","submitted_at":"2022-05-03T23:38:50Z","abstract_excerpt":"Human intelligence is multimodal; we integrate visual, linguistic, and acoustic signals to maintain a holistic worldview. Most current pretraining methods, however, are limited to one or two modalities. We present i-Code, a self-supervised pretraining framework where users may flexibly combine the modalities of vision, speech, and language into unified and general-purpose vector representations. In this framework, data from each modality are first given to pretrained single-modality encoders. The encoder outputs are then integrated with a multimodal fusion network, which uses novel attention m"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2205.01818","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2205.01818/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2205.01818","created_at":"2026-07-05T04:20:35.106825+00:00"},{"alias_kind":"arxiv_version","alias_value":"2205.01818v2","created_at":"2026-07-05T04:20:35.106825+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2205.01818","created_at":"2026-07-05T04:20:35.106825+00:00"},{"alias_kind":"pith_short_12","alias_value":"AQQT67GG4YZL","created_at":"2026-07-05T04:20:35.106825+00:00"},{"alias_kind":"pith_short_16","alias_value":"AQQT67GG4YZLIOL7","created_at":"2026-07-05T04:20:35.106825+00:00"},{"alias_kind":"pith_short_8","alias_value":"AQQT67GG","created_at":"2026-07-05T04:20:35.106825+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/AQQT67GG4YZLIOL7PRVW2GAHME","json":"https://pith.science/pith/AQQT67GG4YZLIOL7PRVW2GAHME.json","graph_json":"https://pith.science/api/pith-number/AQQT67GG4YZLIOL7PRVW2GAHME/graph.json","events_json":"https://pith.science/api/pith-number/AQQT67GG4YZLIOL7PRVW2GAHME/events.json","paper":"https://pith.science/paper/AQQT67GG"},"agent_actions":{"view_html":"https://pith.science/pith/AQQT67GG4YZLIOL7PRVW2GAHME","download_json":"https://pith.science/pith/AQQT67GG4YZLIOL7PRVW2GAHME.json","view_paper":"https://pith.science/paper/AQQT67GG","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2205.01818&json=true","fetch_graph":"https://pith.science/api/pith-number/AQQT67GG4YZLIOL7PRVW2GAHME/graph.json","fetch_events":"https://pith.science/api/pith-number/AQQT67GG4YZLIOL7PRVW2GAHME/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/AQQT67GG4YZLIOL7PRVW2GAHME/action/timestamp_anchor","attest_storage":"https://pith.science/pith/AQQT67GG4YZLIOL7PRVW2GAHME/action/storage_attestation","attest_author":"https://pith.science/pith/AQQT67GG4YZLIOL7PRVW2GAHME/action/author_attestation","sign_citation":"https://pith.science/pith/AQQT67GG4YZLIOL7PRVW2GAHME/action/citation_signature","submit_replication":"https://pith.science/pith/AQQT67GG4YZLIOL7PRVW2GAHME/action/replication_record"}},"created_at":"2026-07-05T04:20:35.106825+00:00","updated_at":"2026-07-05T04:20:35.106825+00:00"}