{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2024:T5JTOKIIROO7PXI4MGSNTKUQZ2","short_pith_number":"pith:T5JTOKII","schema_version":"1.0","canonical_sha256":"9f533729088b9df7dd1c61a4d9aa90cebfe2740ec7903a498e8e7deb45b4ac47","source":{"kind":"arxiv","id":"2410.01529","version":1},"attestation_state":"computed","paper":{"title":"Robo-MUTUAL: Robotic Multimodal Task Specification via Unimodal Learning","license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","headline":"","cross_cats":["cs.CV"],"primary_cat":"cs.RO","authors_text":"Guanglu Song, Guanming Wang, Jianxiong Li, Jingjing Liu, Jinliang Zheng, Junzhi Yu, Xianyuan Zhan, Xiaoai Zhou, Ya-Qin Zhang, Yu Liu, Zhihao Wang","submitted_at":"2024-10-02T13:23:02Z","abstract_excerpt":"Multimodal task specification is essential for enhanced robotic performance, where \\textit{Cross-modality Alignment} enables the robot to holistically understand complex task instructions. Directly annotating multimodal instructions for model training proves impractical, due to the sparsity of paired multimodal data. In this study, we demonstrate that by leveraging unimodal instructions abundant in real data, we can effectively teach robots to learn multimodal task specifications. First, we endow the robot with strong \\textit{Cross-modality Alignment} capabilities, by pretraining a robotic mul"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2410.01529","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","primary_cat":"cs.RO","submitted_at":"2024-10-02T13:23:02Z","cross_cats_sorted":["cs.CV"],"title_canon_sha256":"d9bde40c827de79376a700eacd51a5e07682ed957fb03f0dc8cc876857e91ae0","abstract_canon_sha256":"dc517cf2b13b21dd2b30ac0d6121cb171da416ea6695af3306816856b4482613"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-07-05T09:14:48.970922Z","signature_b64":"qbOO2zB2aoh9CZ+yiG4mtgw/8f0vaBnK+oQQkkIRz+b5O8Idr6+1y1MzW8QXdc9OvdKEkLKG851BqbOsi96nBA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"9f533729088b9df7dd1c61a4d9aa90cebfe2740ec7903a498e8e7deb45b4ac47","last_reissued_at":"2026-07-05T09:14:48.970375Z","signature_status":"signed_v1","first_computed_at":"2026-07-05T09:14:48.970375Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Robo-MUTUAL: Robotic Multimodal Task Specification via Unimodal Learning","license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","headline":"","cross_cats":["cs.CV"],"primary_cat":"cs.RO","authors_text":"Guanglu Song, Guanming Wang, Jianxiong Li, Jingjing Liu, Jinliang Zheng, Junzhi Yu, Xianyuan Zhan, Xiaoai Zhou, Ya-Qin Zhang, Yu Liu, Zhihao Wang","submitted_at":"2024-10-02T13:23:02Z","abstract_excerpt":"Multimodal task specification is essential for enhanced robotic performance, where \\textit{Cross-modality Alignment} enables the robot to holistically understand complex task instructions. Directly annotating multimodal instructions for model training proves impractical, due to the sparsity of paired multimodal data. In this study, we demonstrate that by leveraging unimodal instructions abundant in real data, we can effectively teach robots to learn multimodal task specifications. First, we endow the robot with strong \\textit{Cross-modality Alignment} capabilities, by pretraining a robotic mul"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2410.01529","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2410.01529/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2410.01529","created_at":"2026-07-05T09:14:48.970427+00:00"},{"alias_kind":"arxiv_version","alias_value":"2410.01529v1","created_at":"2026-07-05T09:14:48.970427+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2410.01529","created_at":"2026-07-05T09:14:48.970427+00:00"},{"alias_kind":"pith_short_12","alias_value":"T5JTOKIIROO7","created_at":"2026-07-05T09:14:48.970427+00:00"},{"alias_kind":"pith_short_16","alias_value":"T5JTOKIIROO7PXI4","created_at":"2026-07-05T09:14:48.970427+00:00"},{"alias_kind":"pith_short_8","alias_value":"T5JTOKII","created_at":"2026-07-05T09:14:48.970427+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/T5JTOKIIROO7PXI4MGSNTKUQZ2","json":"https://pith.science/pith/T5JTOKIIROO7PXI4MGSNTKUQZ2.json","graph_json":"https://pith.science/api/pith-number/T5JTOKIIROO7PXI4MGSNTKUQZ2/graph.json","events_json":"https://pith.science/api/pith-number/T5JTOKIIROO7PXI4MGSNTKUQZ2/events.json","paper":"https://pith.science/paper/T5JTOKII"},"agent_actions":{"view_html":"https://pith.science/pith/T5JTOKIIROO7PXI4MGSNTKUQZ2","download_json":"https://pith.science/pith/T5JTOKIIROO7PXI4MGSNTKUQZ2.json","view_paper":"https://pith.science/paper/T5JTOKII","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2410.01529&json=true","fetch_graph":"https://pith.science/api/pith-number/T5JTOKIIROO7PXI4MGSNTKUQZ2/graph.json","fetch_events":"https://pith.science/api/pith-number/T5JTOKIIROO7PXI4MGSNTKUQZ2/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/T5JTOKIIROO7PXI4MGSNTKUQZ2/action/timestamp_anchor","attest_storage":"https://pith.science/pith/T5JTOKIIROO7PXI4MGSNTKUQZ2/action/storage_attestation","attest_author":"https://pith.science/pith/T5JTOKIIROO7PXI4MGSNTKUQZ2/action/author_attestation","sign_citation":"https://pith.science/pith/T5JTOKIIROO7PXI4MGSNTKUQZ2/action/citation_signature","submit_replication":"https://pith.science/pith/T5JTOKIIROO7PXI4MGSNTKUQZ2/action/replication_record"}},"created_at":"2026-07-05T09:14:48.970427+00:00","updated_at":"2026-07-05T09:14:48.970427+00:00"}