{"work":{"id":"82eb1f7e-d598-409b-9fec-8a7e82965d26","openalex_id":null,"doi":null,"arxiv_id":"2411.04996","raw_key":null,"title":"Mixture-of-Transformers: A Sparse and Scalable Architecture for Multi-Modal Foundation Models","authors":null,"authors_text":"Weixin Liang, Lili Yu, Liang Luo, Srinivasan Iyer, Ning Dong, Chunting Zhou","year":2024,"venue":"cs.CL","abstract":"The development of large language models (LLMs) has expanded to multi-modal systems capable of processing text, images, and speech within a unified framework. Training these models demands significantly larger datasets and computational resources compared to text-only LLMs. To address the scaling challenges, we introduce Mixture-of-Transformers (MoT), a sparse multi-modal transformer architecture that significantly reduces pretraining computational costs. MoT decouples non-embedding parameters of the model by modality -- including feed-forward networks, attention matrices, and layer normalization -- enabling modality-specific processing with global self-attention over the full input sequence. We evaluate MoT across multiple settings and model scales. In the Chameleon 7B setting (autoregressive text-and-image generation), MoT matches the dense baseline's performance using only 55.8\\% of the FLOPs. When extended to include speech, MoT reaches speech performance comparable to the dense baseline with only 37.2\\% of the FLOPs. In the Transfusion setting, where text and image are trained with different objectives, a 7B MoT model matches the image modality performance of the dense baseline with one third of the FLOPs, and a 760M MoT model outperforms a 1.4B dense baseline across key image generation metrics. System profiling further highlights MoT's practical benefits, achieving dense baseline image quality in 47.2\\% of the wall-clock time and text quality in 75.6\\% of the wall-clock time (measured on AWS p4de.24xlarge instances with NVIDIA A100 GPUs).","external_url":"https://arxiv.org/abs/2411.04996","cited_by_count":null,"metadata_source":"pith","metadata_fetched_at":"2026-05-25T05:10:22.298873+00:00","pith_arxiv_id":"2411.04996","created_at":"2026-05-10T09:28:39.274836+00:00","updated_at":"2026-05-25T05:10:22.298873+00:00","title_quality_ok":true,"display_title":"Mixture-of-Transformers: A Sparse and Scalable Architecture for Multi-Modal Foundation Models","render_title":"Mixture-of-Transformers: A Sparse and Scalable Architecture for Multi-Modal Foundation Models"},"hub":{"state":{"work_id":"82eb1f7e-d598-409b-9fec-8a7e82965d26","tier":"hub","tier_reason":"10+ Pith inbound or 1,000+ external citations","pith_inbound_count":18,"external_cited_by_count":null,"distinct_field_count":2,"first_pith_cited_at":"2025-05-08T17:58:57+00:00","last_pith_cited_at":"2026-05-22T02:56:34+00:00","author_build_status":"not_needed","summary_status":"needed","contexts_status":"needed","graph_status":"needed","ask_index_status":"not_needed","reader_status":"not_needed","recognition_status":"not_needed","updated_at":"2026-06-05T15:19:13.865888+00:00","tier_text":"hub"},"tier":"hub","role_counts":[{"context_role":"background","n":4},{"context_role":"method","n":2},{"context_role":"baseline","n":1}],"polarity_counts":[{"context_polarity":"background","n":4},{"context_polarity":"use_method","n":2},{"context_polarity":"baseline","n":1}],"runs":{},"summary":{},"graph":{},"authors":[]}}