{"total":13,"items":[{"citing_arxiv_id":"2606.29982","ref_index":9,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Beyond Uniform Experts: Cost-Aware Expert Execution for Efficient Multi-Device MoE Inference","primary_cat":"cs.DC","submitted_at":"2026-06-29T08:57:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"CAEE reduces MoE inference latency 8-18% on 671B DeepSeek-R1 by cost-aware expert pruning and low-overhead compensation while keeping accuracy drop under 1%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.05538","ref_index":40,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Less is MoE: Trimming Experts in Domain-Specialist Language Models","primary_cat":"cs.LG","submitted_at":"2026-06-04T00:43:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Fisher-MoE prunes sparse intermediate dimensions in MoE FFNs ranked by Fisher importance, delivering 50% compression that preserves capability while cutting memory ~45% and raising throughput 21%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.09886","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"SHAPE: Coalition-Aware Expert Pruning for Sparse Mixture-of-Experts LLMs","primary_cat":"cs.LG","submitted_at":"2026-06-03T08:41:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SHAPE applies coalition-aware Shapley values to prune experts in MoE LLMs, retaining competitive accuracy at 20-40% pruning rates on Qwen3-30B-A3B, GPT-OSS-20B, and DeepSeek-V2-Lite without retraining.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01007","ref_index":27,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Beyond Task-Agnostic: Task-Aware Grouping for Communication-Efficient Multi-Task MoE Inference","primary_cat":"cs.LG","submitted_at":"2026-05-31T04:51:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Task-aware expert grouping derived from family-specific co-activation traces cuts average communication cost 31.39% versus task-agnostic baselines in multi-task MoE inference while maintaining Jain fairness near 1.0.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30876","ref_index":18,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"dMoE: dLLMs with Learnable Block Experts","primary_cat":"cs.CL","submitted_at":"2026-05-29T06:03:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"dMoE aggregates token expert distributions to block level in dLLMs, cutting unique experts from 69.5 to 14.6, memory by 76-80%, and latency by 1.14-1.66x while retaining 99.11% performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28207","ref_index":4,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Pruning and Distilling Mixture-of-Experts into Dense Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-27T09:27:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A systematic MoE-to-dense conversion via expert scoring, grouping, and distillation yields +6.3 pp average accuracy over dense-to-dense pruning at matched parameter count on tested models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.06542","ref_index":6,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Does a Global Perspective Help Prune Sparse MoEs Elegantly?","primary_cat":"cs.CL","submitted_at":"2026-04-08T00:41:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"GRAPE is a global redundancy-aware pruning strategy for sparse MoEs that dynamically allocates pruning budgets across layers and improves average accuracy by 1.40% over the best local baseline across tested models and settings.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.04356","ref_index":1,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"REAM: Merging Improves Pruning of Experts in LLMs","primary_cat":"cs.AI","submitted_at":"2026-04-06T02:08:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"REAM merges experts in MoE LLMs rather than pruning them, often matching uncompressed performance by tuning the mix of calibration data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.02715","ref_index":9,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"FluxMoE: Decoupling Expert Residency for High-Performance MoE Serving","primary_cat":"cs.LG","submitted_at":"2026-04-03T04:16:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"FluxMoE decouples MoE expert weights from persistent GPU residency via on-demand paging, achieving up to 3x throughput gains over vLLM in memory-constrained inference without accuracy loss.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"These approaches, however, assume that all expert parame- ters reside in GPU memory and do not address the memory pressure arising from large expert pools. FluxMoE is orthog- onal: it reduces per-GPU expert memory through streaming rather than distributing experts across additional GPUs. Model compression in LLMs. Lossy methods (e.g., quantization [ 2, 10, 16, 22, 29, 30, 34, 58] and pruning [9, 11, 37, 38, 44, 53]) offer high compression ratios but risk ac- curacy loss, limiting applicability in precision-critical deploy- ments. Lossless alternatives, such as ZipNN [20, 21] and LMC [49], apply Huffman coding [23] offline, but are not designed for active inference. DietGPU [ 1], NVComp [3], DFloat11 [59], and ZipServ [13] leverage lossless compression to accel-"},{"citing_arxiv_id":"2603.06003","ref_index":5,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"EvoESAP: Non-Uniform Expert Pruning for Sparse MoE","primary_cat":"cs.LG","submitted_at":"2026-03-06T08:02:58+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"EvoESAP uses evolutionary search guided by a speculative-decoding-inspired ESAP metric to discover non-uniform layer-wise sparsity allocations for MoE expert pruning, improving generation accuracy up to 19.6% at 50% sparsity.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.07379","ref_index":28,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"DuoServe-MoE: Dual-Phase Expert Prefetch and Caching for LLM Inference QoS Assurance","primary_cat":"cs.DC","submitted_at":"2025-09-09T04:00:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DuoServe-MoE decouples prefill and decode phases in MoE LLM inference with a two-stream CUDA pipeline for prefill and an offline-trained predictor for decode, reporting up to 5.34x TTFT and 7.55x end-to-end latency gains.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2506.12876","ref_index":6,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MaskPro: Linear-Space Probabilistic Learning for Strict (N:M)-Sparsity on LLMs","primary_cat":"cs.LG","submitted_at":"2025-06-15T15:02:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MaskPro learns categorical distributions over groups of M weights to generate exact (N:M) sparsity via N-way sampling without replacement and stabilizes training with a moving average tracker of loss residuals.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2411.08982","ref_index":3,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Lynx: Enabling Efficient MoE Inference through Dynamic Batch-Aware Expert Selection","primary_cat":"cs.LG","submitted_at":"2024-11-13T19:18:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Lynx exploits training-induced batch-level expert activation skews via AffinityBinning to reduce invoked experts per batch, delivering up to 1.30x throughput with under 1% accuracy loss across four model families.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}