{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2024:A3RUFTQ6WVIRBO6ZC3HFVN5AQB","short_pith_number":"pith:A3RUFTQ6","canonical_record":{"source":{"id":"2401.15947","kind":"arxiv","version":5},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2024-01-29T08:13:40Z","cross_cats_sorted":[],"title_canon_sha256":"7666fd243c25d39fbfa15fa0ea2bdcdad8661250b910e7af53030474f26307ae","abstract_canon_sha256":"2b5c7d7e6de1179e7a6278a47bb5ecd1807f557affaa2282f9da7fa2cc8a9dbe"},"schema_version":"1.0"},"canonical_sha256":"06e342ce1eb55110bbd916ce5ab7a08051bc14b44336aa96ce4d6db2c67a51a0","source":{"kind":"arxiv","id":"2401.15947","version":5},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2401.15947","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"arxiv_version","alias_value":"2401.15947v5","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2401.15947","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"pith_short_12","alias_value":"A3RUFTQ6WVIR","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"A3RUFTQ6WVIRBO6Z","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"A3RUFTQ6","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2024:A3RUFTQ6WVIRBO6ZC3HFVN5AQB","target":"record","payload":{"canonical_record":{"source":{"id":"2401.15947","kind":"arxiv","version":5},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2024-01-29T08:13:40Z","cross_cats_sorted":[],"title_canon_sha256":"7666fd243c25d39fbfa15fa0ea2bdcdad8661250b910e7af53030474f26307ae","abstract_canon_sha256":"2b5c7d7e6de1179e7a6278a47bb5ecd1807f557affaa2282f9da7fa2cc8a9dbe"},"schema_version":"1.0"},"canonical_sha256":"06e342ce1eb55110bbd916ce5ab7a08051bc14b44336aa96ce4d6db2c67a51a0","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:49.360546Z","signature_b64":"fOpJU76HXP9Jc6fiamCrFAracEbOtKt2MwdPbFwUvYa+35en4QAH75VuIILLnM5QWqKLfNt8S/oN/WNwKm64Ag==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"06e342ce1eb55110bbd916ce5ab7a08051bc14b44336aa96ce4d6db2c67a51a0","last_reissued_at":"2026-05-17T23:38:49.359936Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:49.359936Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2401.15947","source_version":5,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:49Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"Zl6DA98xVDOcazHK+eV55lrkmxXysLUDNJT11WbOJZM5T5c2hwDVdLWJwDSi+iOPyYHqiWJNPbUFo1B8N7PICw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-05T02:02:51.298966Z"},"content_sha256":"48a773958ed786f83e855c113f6900caa9343e4c075b7526c5894effdd617577","schema_version":"1.0","event_id":"sha256:48a773958ed786f83e855c113f6900caa9343e4c075b7526c5894effdd617577"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2024:A3RUFTQ6WVIRBO6ZC3HFVN5AQB","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"MoE-LLaVA: Mixture of Experts for Large Vision-Language Models","license":"http://creativecommons.org/licenses/by/4.0/","headline":"A sparse vision-language model activates only 3 billion parameters yet matches the performance of a 7 billion parameter dense model on visual tasks.","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Bin Lin, Jiebo Luo, Jinfa Huang, Junwu Zhang, Li Yuan, Munan Ning, Peng Jin, Yang Ye, Yatian Pang, Zhenyu Tang","submitted_at":"2024-01-29T08:13:40Z","abstract_excerpt":"Recent advances demonstrate that scaling Large Vision-Language Models (LVLMs) effectively improves downstream task performances. However, existing scaling methods enable all model parameters to be active for each token in the calculation, which brings massive training and inferring costs. In this work, we propose a simple yet effective training strategy MoE-Tuning for LVLMs. This strategy innovatively addresses the common issue of performance degradation in multi-modal sparsity learning, consequently constructing a sparse model with an outrageous number of parameters but a constant computation"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"With only approximately 3B sparsely activated parameters, MoE-LLaVA demonstrates performance comparable to the LLaVA-1.5-7B on various visual understanding datasets and even surpasses the LLaVA-1.5-13B in object hallucination benchmark.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the proposed MoE-Tuning strategy reliably prevents the performance degradation typically observed when applying sparsity to multi-modal models, allowing the sparse model to retain dense-model capability.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"MoE-LLaVA applies mixture-of-experts sparsity to LVLMs via MoE-Tuning, delivering LLaVA-1.5-7B level visual understanding and better hallucination resistance with only ~3B active parameters.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"A sparse vision-language model activates only 3 billion parameters yet matches the performance of a 7 billion parameter dense model on visual tasks.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"c06b8357b423b75aa4e61e70421a21596e969d2763a750365c41f41f9d9ed6c6"},"source":{"id":"2401.15947","kind":"arxiv","version":5},"verdict":{"id":"5a31450a-60be-4112-b55c-e94105b921a6","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-16T02:28:13.774116Z","strongest_claim":"With only approximately 3B sparsely activated parameters, MoE-LLaVA demonstrates performance comparable to the LLaVA-1.5-7B on various visual understanding datasets and even surpasses the LLaVA-1.5-13B in object hallucination benchmark.","one_line_summary":"MoE-LLaVA applies mixture-of-experts sparsity to LVLMs via MoE-Tuning, delivering LLaVA-1.5-7B level visual understanding and better hallucination resistance with only ~3B active parameters.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the proposed MoE-Tuning strategy reliably prevents the performance degradation typically observed when applying sparsity to multi-modal models, allowing the sparse model to retain dense-model capability.","pith_extraction_headline":"A sparse vision-language model activates only 3 billion parameters yet matches the performance of a 7 billion parameter dense model on visual tasks."},"references":{"count":53,"sample":[{"doi":"","year":null,"title":"arXiv preprint arXiv:1809.10853 , year=","work_id":"84b64f8d-525e-4a27-b130-855b07cc501f","ref_index":1,"cited_arxiv_id":"1809.10853","is_internal_anchor":true},{"doi":"","year":null,"title":"Qwen Technical Report","work_id":"bb1fd52f-6b2f-437c-9516-37bdf6eb9be8","ref_index":2,"cited_arxiv_id":"2309.16609","is_internal_anchor":true},{"doi":"","year":1901,"title":"D., Dhariwal, P., Neelakantan, A., Shyam, P., Sastry, G., Askell, A., et al","work_id":"9806adeb-7378-4bee-a184-3e98c89988dd","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Honeybee: Locality-enhanced projector for multimodal llm","work_id":"fc187c8a-06af-4a3b-b67d-c5db265b73f4","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"Eve: Efficient vision-language pre-training with masked prediction and modality-aware moe","work_id":"1cd41cc6-49db-4692-979a-c767bae9f319","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":53,"snapshot_sha256":"8491c85b02ce8f1fa3a5995aa38b781c8d9f73b3e29d37522ec3daff4d5af346","internal_anchors":23},"formal_canon":{"evidence_count":2,"snapshot_sha256":"27f01d0f152c2f745fd788e30937e7d73ad0cc5820d119cc3ea1a60b714efff2"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"5a31450a-60be-4112-b55c-e94105b921a6"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:49Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"BdazB6MRNABxdynxJmVA96eO+CX34X687i5fTuD72P4yMd3pH0AJyQ4aCzzkblZ115WUVEEYWPOszBZGqa70CA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-05T02:02:51.299792Z"},"content_sha256":"eafc5def41abbf27f4de57af528dfb6d7f67f48e544577647f53383a80093d8f","schema_version":"1.0","event_id":"sha256:eafc5def41abbf27f4de57af528dfb6d7f67f48e544577647f53383a80093d8f"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/A3RUFTQ6WVIRBO6ZC3HFVN5AQB/bundle.json","state_url":"https://pith.science/pith/A3RUFTQ6WVIRBO6ZC3HFVN5AQB/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/A3RUFTQ6WVIRBO6ZC3HFVN5AQB/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-05T02:02:51Z","links":{"resolver":"https://pith.science/pith/A3RUFTQ6WVIRBO6ZC3HFVN5AQB","bundle":"https://pith.science/pith/A3RUFTQ6WVIRBO6ZC3HFVN5AQB/bundle.json","state":"https://pith.science/pith/A3RUFTQ6WVIRBO6ZC3HFVN5AQB/state.json","well_known_bundle":"https://pith.science/.well-known/pith/A3RUFTQ6WVIRBO6ZC3HFVN5AQB/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2024:A3RUFTQ6WVIRBO6ZC3HFVN5AQB","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"2b5c7d7e6de1179e7a6278a47bb5ecd1807f557affaa2282f9da7fa2cc8a9dbe","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2024-01-29T08:13:40Z","title_canon_sha256":"7666fd243c25d39fbfa15fa0ea2bdcdad8661250b910e7af53030474f26307ae"},"schema_version":"1.0","source":{"id":"2401.15947","kind":"arxiv","version":5}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2401.15947","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"arxiv_version","alias_value":"2401.15947v5","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2401.15947","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"pith_short_12","alias_value":"A3RUFTQ6WVIR","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"A3RUFTQ6WVIRBO6Z","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"A3RUFTQ6","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:eafc5def41abbf27f4de57af528dfb6d7f67f48e544577647f53383a80093d8f","target":"graph","created_at":"2026-05-17T23:38:49Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"With only approximately 3B sparsely activated parameters, MoE-LLaVA demonstrates performance comparable to the LLaVA-1.5-7B on various visual understanding datasets and even surpasses the LLaVA-1.5-13B in object hallucination benchmark."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the proposed MoE-Tuning strategy reliably prevents the performance degradation typically observed when applying sparsity to multi-modal models, allowing the sparse model to retain dense-model capability."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"MoE-LLaVA applies mixture-of-experts sparsity to LVLMs via MoE-Tuning, delivering LLaVA-1.5-7B level visual understanding and better hallucination resistance with only ~3B active parameters."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"A sparse vision-language model activates only 3 billion parameters yet matches the performance of a 7 billion parameter dense model on visual tasks."}],"snapshot_sha256":"c06b8357b423b75aa4e61e70421a21596e969d2763a750365c41f41f9d9ed6c6"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"27f01d0f152c2f745fd788e30937e7d73ad0cc5820d119cc3ea1a60b714efff2"},"paper":{"abstract_excerpt":"Recent advances demonstrate that scaling Large Vision-Language Models (LVLMs) effectively improves downstream task performances. However, existing scaling methods enable all model parameters to be active for each token in the calculation, which brings massive training and inferring costs. In this work, we propose a simple yet effective training strategy MoE-Tuning for LVLMs. This strategy innovatively addresses the common issue of performance degradation in multi-modal sparsity learning, consequently constructing a sparse model with an outrageous number of parameters but a constant computation","authors_text":"Bin Lin, Jiebo Luo, Jinfa Huang, Junwu Zhang, Li Yuan, Munan Ning, Peng Jin, Yang Ye, Yatian Pang, Zhenyu Tang","cross_cats":[],"headline":"A sparse vision-language model activates only 3 billion parameters yet matches the performance of a 7 billion parameter dense model on visual tasks.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2024-01-29T08:13:40Z","title":"MoE-LLaVA: Mixture of Experts for Large Vision-Language Models"},"references":{"count":53,"internal_anchors":23,"resolved_work":53,"sample":[{"cited_arxiv_id":"1809.10853","doi":"","is_internal_anchor":true,"ref_index":1,"title":"arXiv preprint arXiv:1809.10853 , year=","work_id":"84b64f8d-525e-4a27-b130-855b07cc501f","year":null},{"cited_arxiv_id":"2309.16609","doi":"","is_internal_anchor":true,"ref_index":2,"title":"Qwen Technical Report","work_id":"bb1fd52f-6b2f-437c-9516-37bdf6eb9be8","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"D., Dhariwal, P., Neelakantan, A., Shyam, P., Sastry, G., Askell, A., et al","work_id":"9806adeb-7378-4bee-a184-3e98c89988dd","year":1901},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Honeybee: Locality-enhanced projector for multimodal llm","work_id":"fc187c8a-06af-4a3b-b67d-c5db265b73f4","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Eve: Efficient vision-language pre-training with masked prediction and modality-aware moe","work_id":"1cd41cc6-49db-4692-979a-c767bae9f319","year":2023}],"snapshot_sha256":"8491c85b02ce8f1fa3a5995aa38b781c8d9f73b3e29d37522ec3daff4d5af346"},"source":{"id":"2401.15947","kind":"arxiv","version":5},"verdict":{"created_at":"2026-05-16T02:28:13.774116Z","id":"5a31450a-60be-4112-b55c-e94105b921a6","model_set":{"reader":"grok-4.3"},"one_line_summary":"MoE-LLaVA applies mixture-of-experts sparsity to LVLMs via MoE-Tuning, delivering LLaVA-1.5-7B level visual understanding and better hallucination resistance with only ~3B active parameters.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"A sparse vision-language model activates only 3 billion parameters yet matches the performance of a 7 billion parameter dense model on visual tasks.","strongest_claim":"With only approximately 3B sparsely activated parameters, MoE-LLaVA demonstrates performance comparable to the LLaVA-1.5-7B on various visual understanding datasets and even surpasses the LLaVA-1.5-13B in object hallucination benchmark.","weakest_assumption":"That the proposed MoE-Tuning strategy reliably prevents the performance degradation typically observed when applying sparsity to multi-modal models, allowing the sparse model to retain dense-model capability."}},"verdict_id":"5a31450a-60be-4112-b55c-e94105b921a6"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:48a773958ed786f83e855c113f6900caa9343e4c075b7526c5894effdd617577","target":"record","created_at":"2026-05-17T23:38:49Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"2b5c7d7e6de1179e7a6278a47bb5ecd1807f557affaa2282f9da7fa2cc8a9dbe","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2024-01-29T08:13:40Z","title_canon_sha256":"7666fd243c25d39fbfa15fa0ea2bdcdad8661250b910e7af53030474f26307ae"},"schema_version":"1.0","source":{"id":"2401.15947","kind":"arxiv","version":5}},"canonical_sha256":"06e342ce1eb55110bbd916ce5ab7a08051bc14b44336aa96ce4d6db2c67a51a0","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"06e342ce1eb55110bbd916ce5ab7a08051bc14b44336aa96ce4d6db2c67a51a0","first_computed_at":"2026-05-17T23:38:49.359936Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:49.359936Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"fOpJU76HXP9Jc6fiamCrFAracEbOtKt2MwdPbFwUvYa+35en4QAH75VuIILLnM5QWqKLfNt8S/oN/WNwKm64Ag==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:49.360546Z","signed_message":"canonical_sha256_bytes"},"source_id":"2401.15947","source_kind":"arxiv","source_version":5}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:48a773958ed786f83e855c113f6900caa9343e4c075b7526c5894effdd617577","sha256:eafc5def41abbf27f4de57af528dfb6d7f67f48e544577647f53383a80093d8f"],"state_sha256":"f4fdf6ab19c3bcdce907b548bbd2cb2b3414da27dcbaf79b45d5c17518128c59"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"sslskt5uHiNKaeVQEZtAqqozweC/b7Vluv7q2hkdUJTzL9xgUOveUzAlSiCWetefhsJXtpNrL5SPodz6iYXjCQ==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-05T02:02:51.302980Z","bundle_sha256":"6196fdc60ef2903875efc975605fa60c062ed155f077d358b86abc9c22098c5e"}}