{"total":12,"items":[{"citing_arxiv_id":"2605.17923","ref_index":3,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"AdaptiveLoad: Towards Efficient Video Diffusion Transformer Training","primary_cat":"cs.DC","submitted_at":"2026-05-18T06:30:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"AdaptiveLoad cuts computational imbalance in video DiT training from 39% to 18.9% and raises throughput 27.2% via memory-compute constraints and a custom LayerNorm-Modulate kernel.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15622","ref_index":119,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Position: Zeroth-Order Optimization in Deep Learning Is Underexplored, Not Underpowered","primary_cat":"cs.LG","submitted_at":"2026-05-15T05:11:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"UNKNOWN","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Zeroth-order optimization is underexplored rather than underpowered in deep learning, with limitations stemming from full-space designs that can be addressed via subspace, spectral, and systems-aware approaches.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15250","ref_index":9,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"GQLA: Group-Query Latent Attention for Hardware-Adaptive Large Language Model Decoding","primary_cat":"cs.LG","submitted_at":"2026-05-14T15:50:01+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13080","ref_index":9,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Learning to See What You Need: Gaze Attention for Multimodal Large Language Models","primary_cat":"cs.CV","submitted_at":"2026-05-13T06:54:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Gaze Attention groups visual embeddings into selectable regions and dynamically restricts attention to task-relevant ones, matching dense baselines with up to 90% fewer visual KV entries via added context tokens.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12464","ref_index":48,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Search Your Block Floating Point Scales!","primary_cat":"cs.LG","submitted_at":"2026-05-12T17:50:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ScaleSearch optimizes block floating point scales via fine-grained search to cut quantization error by 27% for NVFP4, improving PTQ by up to 15 points on MATH500 for Qwen3-8B and attention PPL by 0.77 on Llama 3.1 70B.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11408","ref_index":43,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"MaskTab: Scalable Masked Tabular Pretraining with Scaling Laws and Distillation for Industrial Classification","primary_cat":"cs.LG","submitted_at":"2026-05-12T01:56:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MaskTab is a masked pretraining method for industrial tabular data that delivers measurable gains in classification AUC and KS metrics while enabling effective distillation to smaller models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10247","ref_index":41,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Teaching LLMs to See Graphs: Unifying Text and Structural Reasoning","primary_cat":"cs.LG","submitted_at":"2026-05-11T09:19:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GTLM injects graph-aware attention biases into LLMs using only 0.015% extra parameters, enabling native graph processing that matches 7B models with a 1B model on text-attributed graph benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09472","ref_index":44,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Positional LSH: Binary Block Matrix Approximation for Attention with Linear Biases","primary_cat":"cs.LG","submitted_at":"2026-05-10T10:58:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ALiBi bias is the expectation of positional LSH-induced block masks, yielding spectral and max-norm approximation bounds that reduce long-context biased attention to randomized short-context unbiased attention.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07859","ref_index":35,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"EyeCue: Driver Cognitive Distraction Detection via Gaze-Empowered Egocentric Video Understanding","primary_cat":"cs.CV","submitted_at":"2026-05-08T15:20:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"EyeCue detects driver cognitive distraction by modeling gaze-visual context interactions in egocentric videos and achieves 74.38% accuracy on the new CogDrive dataset, outperforming 11 baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.21085","ref_index":84,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"climt-paraformer: Stable Emulation of Convective Parameterization using a Temporal Memory-aware Transformer","primary_cat":"physics.ao-ph","submitted_at":"2026-04-22T20:55:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A temporal memory-aware Transformer emulator for the Emanuel convective parameterization shows lower offline errors and 10-year stability in single-column model tests compared to memory-less MLP and LSTM baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.20915","ref_index":15,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Absorber LLM: Harnessing Causal Synchronization for Test-Time Training","primary_cat":"cs.LG","submitted_at":"2026-04-22T02:58:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Absorber LLM introduces causal synchronization to absorb context into parameters for memory-efficient long-context LLM inference while preserving causal effects.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.19351","ref_index":25,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"DASH-KV: Accelerating Long-Context LLM Inference via Asymmetric KV Cache Hashing","primary_cat":"cs.CL","submitted_at":"2026-04-21T11:33:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DASH-KV accelerates long-context LLM inference to linear complexity via asymmetric KV cache hashing and mixed-precision retention, matching full attention performance on LongBench.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}