{"total":14,"items":[{"citing_arxiv_id":"2604.18562","ref_index":28,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"AnchorSeg: Language Grounded Query Banks for Reasoning Segmentation","primary_cat":"cs.CV","submitted_at":"2026-04-20T17:49:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"AnchorSeg uses ordered query banks of latent reasoning tokens plus a spatial anchor token and a Token-Mask Cycle Consistency loss to achieve 67.7% gIoU and 68.1% cIoU on the ReasonSeg benchmark.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04058","ref_index":62,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MP-ISMoE: Mixed-Precision Interactive Side Mixture-of-Experts for Efficient Transfer Learning","primary_cat":"cs.LG","submitted_at":"2026-04-10T08:00:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MP-ISMoE uses Gaussian noise perturbed iterative quantization and interactive side mixture-of-experts to deliver higher accuracy than prior memory-efficient transfer learning methods while keeping similar parameter and memory usage.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.22123","ref_index":150,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Multilingual Vision-Language Models, A Survey","primary_cat":"cs.CL","submitted_at":"2025-09-26T09:46:13+00:00","verdict":"ACCEPT","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"The survey identifies a key tension in multilingual vision-language models between language neutrality via contrastive learning and cultural awareness via diverse data, with most benchmarks relying on translation-based evaluation.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"[100] 28 17.2 72.0 83.6≠200K=2K emotional subjectivity M5(VGR, VLOD) Schneider and Sitaram [122] 12 16.8 61.7* 74.9≠1.4K≠6.8K culture SMPQA Geigle et al. [53] 11 28.9 61.3 76.9=7K=50 synthetic M3Exam Zhang et al. [167] 9 21.4 64.6 65.1≠2.8K≠3.1K general knowledge EXAMS-V Das et al. [33] 11 22.6 61.3 76.4≠5K≠1.2K general knowledge WorldCuisines Winata et al. [150] 24 44.8 65.5* 88.2=1M≠6K culture MLMemes Dimitrov et al. [40] 4 8.2 59.5 43.1≠25.3K≠10.8K persuation techniques xMMMU Yue et al. [160] 7 18.0 62.0 69.2=2.6K=300 semantics MTVQA Tang et al. [136] 10 16.4 62.0 77.9≠28.6K≠8.7K semantics CVQA Romero et al. [119] 31 40.9 67.7 88.2≠10.3K≠5.2K culture MVL-SIB Schmidt et al. [121] 199 70.2 70.8 95.4=3.1M=70 semantics"},{"citing_arxiv_id":"2411.04996","ref_index":38,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Mixture-of-Transformers: A Sparse and Scalable Architecture for Multi-Modal Foundation Models","primary_cat":"cs.CL","submitted_at":"2024-11-07T18:59:06+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MoT decouples non-embedding parameters by modality in transformers to match dense multi-modal performance with roughly one-third to one-half the FLOPs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2407.07726","ref_index":141,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"PaliGemma: A versatile 3B VLM for transfer","primary_cat":"cs.CV","submitted_at":"2024-07-10T14:57:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"PaliGemma is an open 3B VLM based on SigLIP and Gemma that achieves strong performance on nearly 40 diverse open-world tasks including benchmarks, remote-sensing, and segmentation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2406.05410","ref_index":19,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"ChatSR: Multimodal Large Language Models for Scientific Formula Discovery","primary_cat":"cs.AI","submitted_at":"2024-06-08T09:17:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"ChatSR aligns scientific data encoders with LLMs to produce formulas that fit data and satisfy explicit priors, reporting SOTA results on 13 symbolic regression benchmarks plus zero-shot handling of unseen prior types.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2402.03766","ref_index":67,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MobileVLM V2: Faster and Stronger Baseline for Vision Language Model","primary_cat":"cs.CV","submitted_at":"2024-02-06T07:16:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"MobileVLM V2 shows that 1.7B and 3B parameter vision-language models can reach or exceed the performance of 3B and 7B+ models on common VLM benchmarks via targeted design and data improvements.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2401.15947","ref_index":36,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MoE-LLaVA: Mixture of Experts for Large Vision-Language Models","primary_cat":"cs.CV","submitted_at":"2024-01-29T08:13:40+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MoE-LLaVA applies mixture-of-experts sparsity to LVLMs via MoE-Tuning, delivering LLaVA-1.5-7B level visual understanding and better hallucination resistance with only ~3B active parameters.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2309.16588","ref_index":280,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Vision Transformers Need Registers","primary_cat":"cs.CV","submitted_at":"2023-09-28T16:45:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Adding register tokens to Vision Transformers eliminates high-norm background artifacts and raises state-of-the-art performance on dense visual prediction tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2305.18565","ref_index":49,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"PaLI-X: On Scaling up a Multilingual Vision and Language Model","primary_cat":"cs.CV","submitted_at":"2023-05-29T18:58:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Scaling a multilingual vision-language model in size and training breadth yields new state-of-the-art results on over 25 benchmarks plus emerging abilities in counting and multilingual detection.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2303.16199","ref_index":142,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"LLaMA-Adapter: Efficient Fine-tuning of Language Models with Zero-init Attention","primary_cat":"cs.CV","submitted_at":"2023-03-28T17:59:12+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LLaMA-Adapter turns frozen LLaMA 7B into a capable instruction follower using only 1.2M new parameters and zero-init attention, matching Alpaca while extending to image-conditioned reasoning on ScienceQA and COCO.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2302.14045","ref_index":24,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Language Is Not All You Need: Aligning Perception with Language Models","primary_cat":"cs.CL","submitted_at":"2023-02-27T18:55:27+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Kosmos-1 shows strong zero-shot and few-shot results on language tasks, image captioning, visual QA, OCR-free document understanding, and image recognition guided by text instructions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2212.03191","ref_index":19,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"InternVideo: General Video Foundation Models via Generative and Discriminative Learning","primary_cat":"cs.CV","submitted_at":"2022-12-06T18:09:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"InternVideo combines masked video modeling and video-language contrastive learning into a single foundation model that reaches state-of-the-art results on 39 video datasets including 91.1% top-1 on Kinetics-400.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2209.06794","ref_index":82,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"PaLI: A Jointly-Scaled Multilingual Language-Image Model","primary_cat":"cs.CV","submitted_at":"2022-09-14T17:24:07+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"PaLI jointly scales a 4B-parameter vision transformer with language models on a new 10B multilingual image-text dataset to reach state-of-the-art results on vision-language tasks while keeping a simple modular design.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}