{"total":104,"items":[{"citing_arxiv_id":"2606.27206","ref_index":77,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Syntactic Belief Update as the Driver of Garden Path Processing Difficulty","primary_cat":"cs.CL","submitted_at":"2026-06-25T16:02:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Syntactic belief update via generalized Rényi divergence on syntactic trees predicts garden path reading times better than lexical surprisal.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.13097","ref_index":27,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Functional Cache Grafting: Robust and Rapid Code-Policy Synthesis for Embodied Agents","primary_cat":"cs.PL","submitted_at":"2026-06-11T09:25:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"FCGraft synthesizes code policies for embodied agents by grafting KV caches from a library of validated functions, claiming 18.31% higher success rate and 2.3x faster synthesis than prompt-level caching.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.02953","ref_index":50,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Linguistic Productivity in Large Language Models: Models Coerce, but do not Preempt","primary_cat":"cs.CL","submitted_at":"2026-06-01T23:11:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Larger LLMs reproduce constructional productivity via entrenchment in coercion cases with nonce words but fail to use statistical preemption to avoid overgeneralizing semantically plausible but unobserved patterns.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.02305","ref_index":131,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Mapping Whisper Representations to Human ECoG Responses with Interpretable Time-Resolved Neural Encoding","primary_cat":"q-bio.NC","submitted_at":"2026-06-01T14:25:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"The paper introduces a time-resolved neural encoder combining Whisper embeddings with recurrent temporal modeling and soft attention to predict ECoG responses, finding strongest alignment in intermediate layers and anatomically coherent phoneme organization in electrodes.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30729","ref_index":38,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SemStruct: Contextualizing Semantic Embeddings with Structural Information for Schema Matching","primary_cat":"cs.LG","submitted_at":"2026-05-29T01:45:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SemStruct models tables as heterogeneous graphs with GNNs on frozen PLM embeddings to incorporate row co-occurrences for schema matching and reports SOTA results on Valentine and SOTAB-SM benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22984","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Test-Time Training Undermines Safety Guardrails","primary_cat":"cs.LG","submitted_at":"2026-05-21T19:27:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Test-time training enables three new threat models that raise jailbreak attack success rates on language models to averages of 95% and 93% ASR@10 under LoRA for few-shot and generation-phase attacks across model families.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22823","ref_index":67,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Which Way Did It Move? Diagnosing and Overcoming Directional Motion Blindness in Video-LLMs","primary_cat":"cs.CV","submitted_at":"2026-05-21T17:59:56+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Video-LLMs exhibit directional motion blindness from a direction binding gap; DeltaDirect projector objective lifts synthetic accuracy to 85.4% and real accuracy by 21.9 points while preserving other video capabilities.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19042","ref_index":61,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Interference-Aware Multi-Task Unlearning","primary_cat":"cs.AI","submitted_at":"2026-05-18T19:05:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Introduces interference-aware multi-task unlearning with task-aware gradient projection and instance-level gradient orthogonalization, reducing interference scores by 30.3% and 52.9% on vision benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18932","ref_index":34,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"HypergraphFormer: Learning Hypergraphs from LLMs for Editable Floor Plan Generation","primary_cat":"cs.LG","submitted_at":"2026-05-18T15:00:29+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16468","ref_index":68,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Mechanistically Interpretable Neural Encoding Reveals Fine-Grained Functional Selectivity in Human Visual Cortex","primary_cat":"cs.CV","submitted_at":"2026-05-15T11:28:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MINE uses mechanistic interpretability on language-aligned image representations to generate per-voxel feature descriptions, validated via image generation and counterfactual edits that causally shift brain activation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15230","ref_index":33,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"EnergyAgentBench: Benchmarking LLM Agents on Live Energy Infrastructure Data","primary_cat":"econ.EM","submitted_at":"2026-05-13T18:03:51+00:00","verdict":"ACCEPT","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"EnergyAgentBench is a new benchmark with 70 task variants that evaluates LLM agents on live energy data for datacenter siting, long-horizon optimization, and causal grid diagnosis.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13429","ref_index":90,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"TokAlign++: Advancing Vocabulary Adaptation via Better Token Alignment","primary_cat":"cs.CL","submitted_at":"2026-05-13T12:23:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"TokAlign++ learns token alignments between LLM vocabularies from monolingual representations to enable faster adaptation, better text compression, and effective token-level distillation across 15 languages with minimal steps.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11277","ref_index":54,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Sieve: Dynamic Expert-Aware PIM Acceleration for Evolving Mixture-of-Experts Models","primary_cat":"cs.AR","submitted_at":"2026-05-11T22:00:39+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"Sieve dynamically schedules MoE experts across GPU and PIM hardware to handle bimodal token distributions, achieving 1.3x to 1.6x gains in throughput and interactivity over static prior PIM systems on three large models.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"HBM-PIM stacks, 𝑆⊆𝐸 the subset assigned to PIM, and 𝐺=𝐸−𝑆 the subset assigned to the GPU. TheSievescheduler finds the partition 𝑆 ∗ that minimizes the bottleneck across three components: 𝑆 ∗ =arg min 𝑆⊆𝐸 max \u0010 𝑇Comm, 𝑇 GPU (𝐺), 𝑇 PIM (𝑆) \u0011 (1) Prior work has proposed highly fine-grained overlap of com- putation and communication to maximize GPU utilization during MoE inference [54, 57]. Accordingly, theSievescheduler uses the 6 Sieve: Dynamic Expert-Aware PIM Acceleration for Evolving Mixture-of-Experts Models maximum estimated execution time across the interconnect, GPU, and PIM. Since theSievescheduler runs on the critical path, it is designed to identify the dominant bottleneck with low overhead rather than exactly predict execution time."},{"citing_arxiv_id":"2605.10825","ref_index":41,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Large Spectrum Models (LSMs): Decoder-Only Transformer-Powered Spectrum Activity Forecasting via Tokenized RF Data","primary_cat":"cs.NI","submitted_at":"2026-05-11T16:43:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Decoder-only transformers trained on tokenized RF spectrum data from 22 TB of measurements achieve 3.25 dB RMSE in spectrum activity forecasting across 33 bands.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"computing resources we have for this work. Based on these criteria, we select the following five mod- els: (1) Gemma-2B from Google DeepMind [11], (2) GPT- 2 from OpenAI [12], (3) LLaMA-7B from Meta [13], [40], (4) Mistral-7B from Mistral AI [14], and (5) Phi-1 from Microsoft [15] All five models are available through the Hugging Face transformers library [41]. In Figs. 4, we present the models utilized for training and evaluating the spectrum dataset. Each model represents a scaled implementation of its original architecture, modified as necessary to accommodate task and computational con- straints. However, the key properties of the original models are preserved. Consequently, we prepend each model name with"},{"citing_arxiv_id":"2605.08975","ref_index":30,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Latency Analysis and Optimization of Alpamayo 1 via Efficient Trajectory Generation","primary_cat":"cs.AI","submitted_at":"2026-05-09T14:34:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Redesigning Alpamayo 1 to single-reasoning and optimizing diffusion action generation cuts inference latency by 69.23% while preserving trajectory diversity and prediction quality.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"processing and postprocessing modules involve no learned components. For simplicity, positional encodings, which inject token position information into the embeddings, are omitted from Fig. 1. Specifically, the vision encoder and the language decoder apply positional encodings internally, while the action decoder receives its position encodings externally, computed by the language decoder. DeepStack [30] is also omitted, which is a technique that fuses visual features from multiple layers of the vision encoder into the early layers of the language decoder to improve visual understanding during the prefill phase. C. Generating Multiple Trajectories Multi-reasoning (N:N).Alpamayo uses the multi-reasoning approach for generating multiple trajectories."},{"citing_arxiv_id":"2605.08776","ref_index":36,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Reasoning Compression with Mixed-Policy Distillation","primary_cat":"cs.AI","submitted_at":"2026-05-09T08:04:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Mixed-Policy Distillation transfers concise reasoning behavior from larger to smaller LLMs by having the teacher compress student-generated trajectories, cutting token usage up to 27% while raising benchmark scores.","context_count":1,"top_context_role":"other","top_context_polarity":"unclear","context_text":"on Neural Information Processing Systems 2019, NeurIPS 2019, December 8-14, 2019, Vancouver, BC, Canada, pages 8024-8035, 2019. [35] Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Clement Delangue, Anthony Moi, Pierric Cistac, Tim Rault, Rémi Louf, Morgan Funtowicz, and Jamie Brew. Huggingface's transformers: State-of- the-art natural language processing.CoRR, abs/1910.03771, 2019. [36] Woosuk Kwon.vLLM: An Efficient Inference Engine for Large Language Models. PhD thesis, UC Berkeley, 2025. [37] Siyan Zhao, Zhihui Xie, Mengchen Liu, Jing Huang, Guan Pang, Feiyu Chen, and Aditya Grover. Self- distilled reasoner: On-policy self-distillation for large language models.CoRR, abs/2601.18734, 2026. 12 Table 6: Hyper-parameter configurations for MPD."},{"citing_arxiv_id":"2605.08636","ref_index":28,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"EdgeFlowerTune: Evaluating Federated LLM Fine-Tuning Under Realistic Edge System Constraints","primary_cat":"cs.CL","submitted_at":"2026-05-09T03:02:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"EdgeFlowerTune is a real-device benchmark that jointly assesses model quality and system costs for federated LLM fine-tuning on edge hardware using three protocols: Quality-under-Budget, Cost-to-Target, and Robustness.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"To support system-aware benchmarking, EdgeFlowerTune is built as a real-device federated LLM fine-tuning platform rather than a simulation-only environment. As shown in Figure 3, the platform consists of a GPU server, a cross-platform communication layer, and heterogeneous edge clients. The server is a Dell PowerEdge T640 equipped with two NVIDIA A800 GPUs. It runsFlower[ 10] for federated orchestration and usesPyTorchandTransformers[ 28] for model execution and aggregation. 4 Figure 3: EdgeFlowerTune Platform. The platform consists of one gpu server and several real edge devices including android smartphones and NVIDIA boards. Table 1: Heterogeneous client devices in the EdgeFlowerTune platform. Devices are ordered from faster to slower execution speed in our testbed. Device Model Processor / CPU Memory"},{"citing_arxiv_id":"2605.07096","ref_index":40,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Query-efficient model evaluation using cached responses","primary_cat":"cs.LG","submitted_at":"2026-05-08T01:24:06+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07075","ref_index":3,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"ModelLens: Finding the Best for Your Task from Myriads of Models","primary_cat":"cs.LG","submitted_at":"2026-05-08T00:49:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ModelLens learns a performance-aware latent space from 1.62M leaderboard records to rank unseen models on unseen datasets without forward passes on the target.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"across diverse QA benchmarks. Case studies on recently released benchmarks further confirm generalization to both text and vision-language tasks. 1 Introduction The rapid growth of open-source machine learning models has created an unprecedented opportunity for practitioners to build, customize, and deploy AI systems [1, 2]. Platforms such as HuggingFace [3] now host hundreds of thousands of models spanning diverse architectures, scales, and application domains. Faced with a new task or dataset, practitioners must decide which model to adopt or fine- tune for their specific use case. Despite its importance, this decision remains notoriously difficult, and typically demands extensive empirical evaluation or ad-hoc trial-and-error [4, 5]."},{"citing_arxiv_id":"2605.06992","ref_index":114,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Why Does Agentic Safety Fail to Generalize Across Tasks?","primary_cat":"cs.LG","submitted_at":"2026-05-07T22:16:03+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Agentic safety fails to generalize across tasks because the task-to-safe-controller mapping has a higher Lipschitz constant than the task-to-controller mapping alone, as proven in linear-quadratic control and demonstrated in quadcopter and LLM experiments.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"V oyager: An open-ended embodied agent with large language models.arXiv preprint arXiv:2305.16291, 2023. [113] Jane X. Wang, Zeb Kurth-Nelson, Dhruva Tirumala, Hubert Soyer, Joel Z. Leibo, Rémi Munos, Charles Blundell, Dharshan Kumaran, and Matt Botvinick. Learning to reinforcement learn. InProceedings of the Annual Conference of the Cognitive Science Society (CogSci), 2016. [114] Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Clement Delangue, Anthony Moi, Pierric Cistac, Tim Rault, Rémi Louf, Morgan Funtowicz, et al. Huggingface's transformers: State-of-the-art natural language processing.arXiv preprint arXiv:1910.03771, 2019. [115] Huan Xu and Shie Mannor. Robustness and generalization.Machine learning, 86(3):391-423, 2012."},{"citing_arxiv_id":"2605.06664","ref_index":30,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"BAMI: Training-Free Bias Mitigation in GUI Grounding","primary_cat":"cs.CV","submitted_at":"2026-05-07T17:59:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"BAMI mitigates precision and ambiguity biases in GUI grounding via coarse-to-fine focus and candidate selection, raising accuracy on ScreenSpot-Pro without training.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"algorithm of BAMI is detailed in Algorithm 1. 5. Experiment 5.1. Experimental Setup ModelsThe proposed BAMI method aims to enhance the accuracy of Grounding models without retraining. We tested this method on several state-of-the-art grounding models, including OS-Atlas-7B [ 32], UI-TARS-1.5-7B [ 22], and TianXi-Action-7B [27]. All models were implemented using the Transformers framework [30] for inference. The input to the models consists of both the query and the screenshot. OS- Atlas and TianXi-Action output bounding box coordinates, while UI-TARS outputs click coordinates. Coarse Prediction Crop Box Cropped Image 1. Coarse-to-Fine Focus Instruction ScreenShot Task Given the UI screenshot, provide the position related to the command \"launch"},{"citing_arxiv_id":"2605.06605","ref_index":67,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"How Many Iterations to Jailbreak? Dynamic Budget Allocation for Multi-Turn LLM Evaluation","primary_cat":"cs.LG","submitted_at":"2026-05-07T17:25:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DAPRO provides the first dynamic, theoretically guaranteed way to allocate interaction budgets across test cases for bounding time-to-event in multi-turn LLM evaluations, achieving tighter coverage than static conformal survival methods.","context_count":1,"top_context_role":"other","top_context_polarity":"unclear","context_text":"Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers), pages 784-789, Melbourne, Australia, July 2018. Association for Computational Linguistics. [66] Jinze Bai, Shuai Bai, Yunfei Chu, Zeyu Cui, Kai Dang, Xiaodong Deng, Yang Fan, Wenbin Ge, Yu Han, Fei Huang, et al. Qwen technical report.arXiv preprint arXiv:2309.16609, 2023. [67] Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Clement Delangue, Anthony Moi, Pierric Cistac, Tim Rault, Rémi Louf, Morgan Funtowicz, et al. Huggingface's transform- ers: State-of-the-art natural language processing.arXiv preprint arXiv:1910.03771, 2019. [68] Woosuk Kwon, Zhuohan Li, Siyuan Zhuang, Ying Sheng, Lianmin Zheng, Cody Hao Yu,"},{"citing_arxiv_id":"2605.06068","ref_index":73,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"VibeServe: Can AI Agents Build Bespoke LLM Serving Systems?","primary_cat":"cs.AI","submitted_at":"2026-05-07T11:54:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"VibeServe demonstrates that AI agents can synthesize bespoke LLM serving systems end-to-end, remaining competitive with vLLM in standard settings while outperforming it in six non-standard scenarios involving unusual models, workloads, or hardware.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"§A gives more details. • Scenario A: Standard LLM serving.We serve Llama-3.1-8B-Instruct [ 19] on an NVIDIA H100, stress-testing VibeServe in a mature setting where existing systems are heavily optimized. We verify greedy-decoding outputs and measure generation throughput across arrival rates. • Scenario B: Code editing with predicted outputs.We serve Qwen3-32B [ 73] on an NVIDIA H100 using a predicted-outputs interface [54]. Code-editing workloads often exhibit large overlap between the input context, such as the original file, and the generated edit [13, 74, 66]. We generate a system to exploit this via speculative decoding from user-provided predictions, a capability absent from standard serving systems. We measure single-batch latency on CodeEditorBench [20]."},{"citing_arxiv_id":"2605.05638","ref_index":42,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Scaling Pretrained Representations Enables Label-Free Out-of-Distribution Detection Without Fine-Tuning","primary_cat":"cs.LG","submitted_at":"2026-05-07T03:45:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Scaling pretrained representations improves label-free OOD detection on frozen backbones, causing performance gaps between global and local detectors to vanish across vision and language tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05115","ref_index":266,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Manifold Steering Reveals the Shared Geometry of Neural Network Representation and Behavior","primary_cat":"cs.LG","submitted_at":"2026-05-06T16:46:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Manifold steering along activation geometry induces behavioral trajectories matching the natural manifold of outputs, while linear steering produces off-manifold unnatural behaviors.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04901","ref_index":118,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"On the (In-)Security of the Shuffling Defense in the Transformer Secure Inference","primary_cat":"cs.CR","submitted_at":"2026-05-06T13:31:15+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"An attack aligns differently shuffled intermediate activations from secure Transformer inference queries to recover model weights with low error using roughly one dollar of queries.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.01732","ref_index":37,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"EGAD: Entropy-Guided Adaptive Distillation for Token-Level Knowledge Transfer","primary_cat":"cs.CL","submitted_at":"2026-05-03T06:05:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"EGAD adaptively distills LLM knowledge at the token level by using entropy to create a curriculum from low- to high-entropy tokens, adjust temperature, and switch between logits-only and feature-based branches.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"into a GPT-2 model with 760M parameters, an OPT model with 13B parame- ters is distilled into a 2.7B parameter variant, and a LLaMA3 model with 13B parameters is distilled into a 8B parameter variant. Implementation Details.All experimental evaluations are performed with the PyTorch deep learning framework [26], in combination with the Hugging Face Transformers toolkit [37]. The computational tasks are run on a single NVIDIA A800 GPU with 80 GB of memory. We set the batch size to 32 and train the model for 10 epochs using the AdamW optimizer [22], with a learning rate of 5×10 −6 and a weight decay of1.0×10−2. We sett0 to half of the total training steps, withT min = 1andT max = 5, and setλ= 0.5. The ratio of low-entropy"},{"citing_arxiv_id":"2604.28109","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Auto-FlexSwitch: Efficient Dynamic Model Merging via Learnable Task Vector Compression","primary_cat":"cs.LG","submitted_at":"2026-04-30T16:58:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Auto-FlexSwitch achieves efficient dynamic model merging by decomposing task vectors into sparse masks, signs, and scalars, then making the compression learnable via gating and adaptive bit selection with KNN-based retrieval.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.26217","ref_index":10,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"OpenSOC-AI: Democratizing Security Operations with Parameter Efficient LLM Log Analysis","primary_cat":"cs.CR","submitted_at":"2026-04-29T01:46:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"LoRA fine-tuning of TinyLlama-1.1B on 450 SOC examples produces 68% threat classification accuracy and 58% severity accuracy on 50 held-out logs, with full code, weights, and data released.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.25872","ref_index":91,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"When Errors Can Be Beneficial: A Categorization of Imperfect Rewards for Policy Gradient","primary_cat":"cs.LG","submitted_at":"2026-04-28T17:10:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Certain errors in proxy rewards for policy gradient methods can be benign or beneficial by preventing policies from stalling on outputs with mediocre ground truth rewards, enabling improved RLHF metrics and reward design insights.","context_count":1,"top_context_role":"other","top_context_polarity":"unclear","context_text":"[89] Xueru Wen, Jie Lou, Yaojie Lu, Hongyu Lin, Xing Yu, Xinyu Lu, Ben He, Xianpei Han, Debing Zhang, and Le Sun. Rethinking reward model evaluation: Are we barking up the wrong tree? InInternational Conference on Learning Representations, 2025. [90] Ronald J Williams. Simple statistical gradient-following algorithms for connectionist reinforcement learning.Machine learning, 8(3):229-256, 1992. [91] Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Clement Delangue, Anthony Moi, Pierric Cistac, Tim Rault, Rémi Louf, Morgan Funtowicz, et al. Huggingface's transformers: State-of-the-art natural language processing.arXiv preprint arXiv:1910.03771, 2019. [92] Blake Woodworth, Suriya Gunasekar, Jason D Lee, Edward Moroshko, Pedro Savarese, Itay Golan, Daniel"},{"citing_arxiv_id":"2604.24678","ref_index":45,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Leveraging LLMs for Multi-File DSL Code Generation: An Industrial Case Study","primary_cat":"cs.SE","submitted_at":"2026-04-27T16:38:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Fine-tuning 7B code LLMs on a custom multi-file DSL dataset achieves structural fidelity of 1.00, high exact-match accuracy, and practical utility validated by expert survey and execution checks.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Anthony Moi, Pierric Cistac, Tim Rault, Rémi Louf, Morgan Funtowicz, Joe Davison, Sam Shleifer, Patrick von Platen, Clara Ma, Yacine Jernite, Julien Plu, Canwen Xu, Teven Le Scao, Sylvain Gugger, Mariama Drame, Quentin Lhoest, and Alexander M. Rush. 2020. HuggingFace's Transformers: State-of-the-art Natural Language Processing. arXiv:1910.03771 [cs.CL] https://arxiv.org/abs/1910.03771 [45] Lingling Xu, Haoran Xie, Si-Zhao Joe Qin, Xiaohui Tao, and Fu Lee Wang. 2023. Parameter-Efficient Fine-Tuning Methods for Pretrained Language Models: A Critical Review and Assessment. arXiv:2312.12148 [cs.CL] https://arxiv.org/abs/ 2312.12148 [46] Minghao Yan, Zhuang Wang, Zhen Jia, Shivaram Venkataraman, and Yida Wang. 2025. PLoRA: Efficient LoRA Hyperparameter Tuning for Large Models."},{"citing_arxiv_id":"2604.21901","ref_index":62,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"GiVA: Gradient-Informed Bases for Vector-Based Adaptation","primary_cat":"cs.CL","submitted_at":"2026-04-23T17:48:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"GiVA uses gradients to initialize vector adapters so they match LoRA performance at eight times lower rank while keeping extreme parameter efficiency.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.20696","ref_index":48,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"R-CoV: Region-Aware Chain-of-Verification for Alleviating Object Hallucinations in LVLMs","primary_cat":"cs.CV","submitted_at":"2026-04-22T15:41:33+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"R-CoV is a six-step region-aware chain-of-verification technique that elicits coordinate and description outputs from LVLMs themselves to detect and reduce object hallucinations without external models or retraining.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18525","ref_index":45,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Towards Better Static Code Analysis Reports: Sentence Transformer-based Filtering of Non-Actionable Alerts","primary_cat":"cs.SE","submitted_at":"2026-04-20T17:22:41+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"STAF applies sentence embeddings from transformers to classify SCA findings, reaching 89% F1 and beating prior filters by 11% within projects and 6% across projects.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"[44] Xin Zhang, Yanzhao Zhang, Dingkun Long, Wen Xie, Ziqi Dai, Jialong Tang, Huan Lin, Baosong Yang, Pengjun Xie, Fei Huang, and Others. 2024. mGTE: Generalized Long-Context Text Representation and Reranking Models for Multilingual Text Retrieval. InProceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track. 1393-1412. [45] Feng Zhangyin, Guo Daya, Tang Duyu, Duan Nan, Feng Xiaocheng, Gong Ming, Shou Linjun, Qin Bing, Liu Ting, n Jiang Daxi, and Zhou Ming. 2020. CodeBERT: A Pre-Trained Model for Programming and Natural Languages. InFindings of the Association for Computational Linguistics: EMNLP 2020, Online Event, 16-20 November 2020 (Findings of ACL, Vol. EMNLP 2020), Trevor Cohn, Yulan He, and"},{"citing_arxiv_id":"2604.17725","ref_index":116,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"RePrompT: Recurrent Prompt Tuning for Integrating Structured EHR Encoders with Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-04-20T02:20:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RePrompT uses recurrent prompt tuning to inject prior-visit latent states and cohort-derived population prompt tokens into LLMs, yielding better performance than pure EHR or pure LLM baselines on MIMIC clinical prediction tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.15499","ref_index":46,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"SecureRouter: Encrypted Routing for Efficient Secure Inference","primary_cat":"cs.CR","submitted_at":"2026-04-16T20:18:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SecureRouter accelerates secure transformer inference by 1.95x via an encrypted router that selects input-adaptive models from an MPC-optimized pool with negligible accuracy loss.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.13950","ref_index":23,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Causal Drawbridges: Characterizing Gradient Blocking of Syntactic Islands in Transformer LMs","primary_cat":"cs.CL","submitted_at":"2026-04-15T15:03:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Causal interventions reveal that coordination islands block filler-gap mechanisms in Transformers in a gradient way matching humans, yielding the hypothesis that 'and' encodes relational dependencies differently in extractable vs. conjunctive uses.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.13549","ref_index":30,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Reconstruction of a 3D wireframe from a single line drawing via generative depth estimation","primary_cat":"cs.CV","submitted_at":"2026-04-15T06:52:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A latent diffusion model conditioned on line drawings estimates dense depth to reconstruct 3D wireframes, reporting 5.3% average depth error after training on over one million pairs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.10733","ref_index":51,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Too Nice to Tell the Truth: Quantifying Agreeableness-Driven Sycophancy in Role-Playing Language Models","primary_cat":"cs.CL","submitted_at":"2026-04-12T17:12:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Agreeableness in AI personas reliably predicts sycophantic behavior in 9 of 13 tested language models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.08299","ref_index":47,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"SeLaR: Selective Latent Reasoning in Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-04-09T14:32:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SeLaR selectively applies latent soft reasoning in LLMs via entropy gating and contrastive regularization, outperforming standard CoT on five benchmarks without training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.07955","ref_index":17,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Rethinking Residual Errors in Compensation-based LLM Quantization","primary_cat":"cs.LG","submitted_at":"2026-04-09T08:20:59+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Redefining residual errors to include compensation-aware discrepancies and realigning calibration to full-precision outputs improves GPTQ and GPTAQ performance on LLMs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.06896","ref_index":68,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"VertAX: a differentiable vertex model for learning epithelial tissue mechanics","primary_cat":"cs.LG","submitted_at":"2026-04-08T09:56:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"VertAX supplies a differentiable JAX implementation of vertex models for confluent epithelia that enables forward simulation, mechanical parameter inference, and inverse design of tissue-scale behaviors.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.06819","ref_index":41,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Beyond End-to-End: Dynamic Chain Optimization for Private LLM Adaptation on the Edge","primary_cat":"cs.DC","submitted_at":"2026-04-08T08:37:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ChainFed achieves memory-efficient private LLM fine-tuning on edge devices through sequential layer-by-layer adapter training with dynamic co-tuning, perceptive optimization, and adaptive starting point selection, improving accuracy by up to 46.46%.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Our experiments are conducted on an on-device FL system composed of heterogeneous hardware (Tam et al., 2024; Tian et al., 2024; Zhan et al., 2024). These devices possess diverse memory and compu- tational capacities, faithfully emulating real-world deployment conditions. All implementations are built using PyTorch and the Hugging Face Trans- formers library (Wolf, 2019; Xu et al., 2026a,c,b; Wu et al., 2026). D.3 Implementation Details For CHAINFED, we set the following hyperparam- eters unless specified otherwise. The GPO loss balancing weight λ is set to 0.1 for BERT and 0.2 for other models. The FOAT threshold T is set to 0.9 for BERT and 0.8 for others. The DLCT co- tuning window size, Q, is determined based on the"},{"citing_arxiv_id":"2604.06297","ref_index":48,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"FedSpy-LLM: Towards Scalable and Generalizable Data Reconstruction Attacks from Gradients on LLMs","primary_cat":"cs.CR","submitted_at":"2026-04-07T17:19:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"FedSpy-LLM uses gradient decomposition and iterative alignment to reconstruct larger batches and longer sequences of training data from LLM gradients in federated settings, including with PEFT methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.02527","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Jump Start or False Start? A Theoretical and Empirical Evaluation of LLM-initialized Bandits","primary_cat":"cs.LG","submitted_at":"2026-04-02T21:27:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LLM warm-starts for bandits remain better than cold-starts up to roughly 30% random label noise but increase regret under systematic misalignment, with a derived sufficient condition on prior error that predicts when the warm-start helps.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.01989","ref_index":48,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Attention at Rest Stays at Rest: Breaking Visual Inertia for Cognitive Hallucination Mitigation","primary_cat":"cs.CV","submitted_at":"2026-04-02T12:51:07+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"+MemVR [68] 78.60+2.13 78.73+2.12 78.40+4.03 78.42+3.91 85.47+3.60 85.56+3.58 Adversarial +Ours 80.53+4.0682.17+5.5680.50+6.1381.70+7.1988.17+6.3088.23+6.25 framework. For fair comparison, all methods are implemented using the default hyperparameters from their official repositories. Implementation Details:We implement IVE using HuggingFace Transform- ers [48] and integrate it with beam search for decoding. All experiments are conducted on 8 NVIDIA H800 GPUs. The EMA smoothing coefficientγis fixed at 0.1, and the selection thresholdτis set to 3.0. The penalty scoreαis tuned for each model and benchmark. Additional implementation details and experimental results are provided in the appendix. 5.2 Main Results"},{"citing_arxiv_id":"2603.29493","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MemFactory: Unified Inference & Training Framework for Agent Memory","primary_cat":"cs.CL","submitted_at":"2026-03-31T09:38:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MemFactory is a new unified modular framework for memory-augmented LLM agent inference and training that integrates GRPO and reports up to 14.8% relative gains on MemAgent evaluations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.00831","ref_index":19,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GhostServe: A Lightweight Checkpointing System in the Shadow for Fault-Tolerant LLM Serving","primary_cat":"cs.DC","submitted_at":"2026-03-26T13:27:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"GhostServe applies erasure coding to KV cache in host memory for fast recovery from failures in LLM serving, cutting checkpointing latency up to 2.7x and recovery latency 2.1x versus prior methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.10960","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Ranking Reasoning LLMs under Test-Time Scaling","primary_cat":"cs.LG","submitted_at":"2026-03-11T16:47:41+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Many established statistical ranking techniques produce orderings of reasoning LLMs under test-time scaling that closely match a Bayesian gold standard, with mean Kendall tau_b of 0.93-0.95 at full trials and best methods reaching 0.86 at single trials.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.20816","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Don't Ignore the Tail: Decoupling top-K Probabilities for Efficient Language Model Distillation","primary_cat":"cs.CL","submitted_at":"2026-02-24T11:54:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A modified divergence decouples top-K teacher probabilities from the distribution tail during distillation, yielding competitive performance on decoder models with standard compute.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}