{"total":379,"items":[{"citing_arxiv_id":"2606.26396","ref_index":57,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"At the Edge of Understanding: Sparse Autoencoders Trace The Limits of Transformer Generalization","primary_cat":"cs.LG","submitted_at":"2026-06-24T21:26:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Sparse autoencoders show OOD prompts increase fallacious concept activation in transformers, offering a mechanistic measure of shift and a path to robust fine-tuning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.11675","ref_index":30,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Lung-R1: A Knowledge Graph-Guided LLM for Pulmonary Diagnostic Reasoning","primary_cat":"cs.AI","submitted_at":"2026-06-10T05:39:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Introduces the first structured pulmonary knowledge graph LungKG and uses it to train Lung-R1, which reaches SOTA on EMR-based pulmonary diagnosis tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.06315","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LLM Self-Recognition: Steering and Retrieving Activation Signatures","primary_cat":"cs.AI","submitted_at":"2026-06-04T15:54:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Steering LLM residual streams with random sparse vectors creates detectable self-recognition fingerprints that enable over 98% accurate attribution of generated text to specific models without degrading output quality.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.09881","ref_index":58,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Toward Calibrated, Fair, and accurate Deepfake Detection","primary_cat":"cs.LG","submitted_at":"2026-06-03T05:44:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Face-Feature Tuning is a label-free logit remapping method that reduces FPR/TPR gaps across groups in deepfake detection while preserving overall accuracy.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.02951","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SCOPE: Real-Time Natural Language Camera Agent at the Edge","primary_cat":"cs.RO","submitted_at":"2026-06-01T23:07:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SCOPE introduces an edge-deployable natural-language PTZ camera agent, a simulation benchmark, and evaluations showing that stronger small language models reduce hallucinations while perception remains the main bottleneck.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01202","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"The Shape of Wisdom: Decision Trajectories in Language Models","primary_cat":"cs.AI","submitted_at":"2026-05-31T12:33:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"A 9,000-trajectory study across three LLMs finds correctness and stability differ, with the largest group unstable-correct and attention scalars aligning better than MLPs in stable cases.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00869","ref_index":67,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Enhancing LLM Metacognition via Cognitive Pairwise Training","primary_cat":"cs.LG","submitted_at":"2026-05-30T19:53:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CPT is introduced as a pairwise reasoning-trace comparison stage that improves the reasoning-metacognition trade-off over standard SFT+RL pipelines across model scales.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00819","ref_index":20,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Mitigating Hallucinations in Large Language Models Via Decoder Layer Skipping","primary_cat":"cs.AI","submitted_at":"2026-05-30T17:40:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DeLask dynamically skips hallucination-prone decoder layers in LLMs by measuring gradient driftance via cosine similarity and partially aggregating states instead of full skipping.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00651","ref_index":23,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MESA: Improving MoE Safety Alignment via Decentralized Expertise","primary_cat":"cs.LG","submitted_at":"2026-05-30T09:54:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MESA decentralizes safety duties in MoE LLMs via expert capacity reallocation and dynamic routing refinement based on optimal transport theory, yielding robust defense on harmful benchmarks while preserving helpfulness.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00539","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GNMR: Runtime Stability Control for Low-Precision Large Language Model Training","primary_cat":"cs.LG","submitted_at":"2026-05-30T05:11:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"GNMR is a gradient-norm-based controller that maps local stability signals to budgeted recovery actions to stabilize low-precision LLM training while preserving quality.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00494","ref_index":74,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ProjQ: Project-and-Quantize for Adapter-Aware LLM Compression","primary_cat":"cs.LG","submitted_at":"2026-05-30T02:54:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ProjQ constrains post-training quantization noise to a low-rank manifold through orthogonal subspace projection, enabling better compensation by LoRA adapters and preserving greater model plasticity than standard PTQ.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00405","ref_index":18,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"From Talking Words to Sharing Thoughts: Scalable Multi-LLM Aggregation via Structured Message Passing","primary_cat":"cs.GT","submitted_at":"2026-05-29T22:47:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"UNKNOWN","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A bipartite factor graph with message-passing protocol and asymmetric damping aggregates multi-LLM predictions, cutting token use by 97% and API calls by 6X while outperforming baselines on MMLU, MMLU-Pro, GPQA, and MedMCQA.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31268","ref_index":26,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Mellum2 Technical Report","primary_cat":"cs.CL","submitted_at":"2026-05-29T13:01:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"Mellum 2 is a 12B MoE model with 2.5B active parameters, trained on 10.6T tokens with MoE, GQA, SWA, and MTP, then post-trained into Instruct and Thinking variants, claimed competitive with 4B-14B models at 2.5B compute.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31086","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Beyond Static Dialogues: Benchmarking Realistic, Heterogeneous, and Evolving Long-Term Memory","primary_cat":"cs.CL","submitted_at":"2026-05-29T09:51:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"RHELM is a benchmark for LLM long-term memory with dynamic profiles, heterogeneous sources, and 27 memory characteristics that reveals weaknesses in existing models for multi-source aggregation and contextual reasoning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.07604","ref_index":39,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Contribution Weights: A Geometrical Analysis of Self-Attention Transformers","primary_cat":"cs.LG","submitted_at":"2026-05-29T09:40:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Contribution Weights combine attention, value magnitude, and directional alignment to measure token influence more faithfully than attention alone, and show attention sinks actively suppress information via a convex sink-rate to output-norm relationship.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.07603","ref_index":19,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MetaEvo: A Meta-Optimization Framework for Experience-Driven Agent Evolution","primary_cat":"cs.LG","submitted_at":"2026-05-29T09:31:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"MetaEvo is a two-stage framework using preference optimization for principle abstraction followed by modular reuse to enable continual improvement of LLM agents on reasoning tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31035","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MixFP4: Enhancing NVFP4 with Adaptive FP4/INT4 Block Representations","primary_cat":"cs.AR","submitted_at":"2026-05-29T09:05:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MixFP4 extends NVFP4 by adaptively selecting between two FP4 micro-formats per block using repurposed scale sign bits and a unified E2M2 compute path, claiming better accuracy than standard NVFP4 at 3.1% area and 1.5% power overhead.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30844","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Fine-Tuning Improves Information Conveyance in Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-29T05:05:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Fine-tuning reorganizes uncertainty in LLMs into more efficient information conveyance, as shown by stronger length-entropy correlations and a tripling of entropy-semantic diversity links after controls.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30832","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SLAT: Segment-Level Adaptive Trimming for Efficient CoT Reasoning","primary_cat":"cs.AI","submitted_at":"2026-05-29T04:37:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SLAT applies segment-level adaptive trimming in RL to reduce CoT reasoning length by 50% while maintaining competitive accuracy on benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30675","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Human-Alignment, Calibration, and Activation Patterns in Large Language Model Uncertainty","primary_cat":"cs.CL","submitted_at":"2026-05-29T00:08:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Examines uncertainty alignment with humans in LLM behavior and activations, its co-occurrence with calibration on multiple-choice and open-ended factual tasks, and effects of instruct fine-tuning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00135","ref_index":35,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"On Effectiveness and Efficiency of Agentic Tool-calling and RL Training","primary_cat":"cs.LG","submitted_at":"2026-05-28T22:21:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Tool-calling evaluations for LLM agents are highly sensitive to implementation details such as random seeds and history handling, and two new techniques accelerate RL training with wall-clock speedup and no performance degradation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30529","ref_index":76,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Generalistic or Specific Embeddings, Which is Better? An Empirical Study on Search for Clinical Coding in Non-English Languages","primary_cat":"cs.CL","submitted_at":"2026-05-28T20:06:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Fine-tuning a Spanish biomedical encoder on Gemini-generated synthetic data for multiple languages yields a bi-encoder that matches or exceeds BioBERT-ST on clinical code retrieval metrics, with further gains from cross-encoder reranking on most languages.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29874","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Evolutionary Dynamics of Cooperation in Next-Generation LLM Agent Systems: A Cross-Provider Empirical Extension","primary_cat":"cs.MA","submitted_at":"2026-05-28T12:58:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Empirical tests on four new frontier LLMs show cooperative equilibria favored in most balanced conditions, with provider identity correlating more strongly with outcomes than model generation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29708","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Understanding Safety-Sensitive Expert Behavior in Mixture-of-Experts LLMs","primary_cat":"cs.CL","submitted_at":"2026-05-28T10:09:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Safety enforcement in aligned MoE LLMs is localized to specific experts and can be altered independently of the model's topic-driven routing patterns via a new red-teaming method called RASET.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29582","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PEARL: Training Socratic Tutors with Pedagogically Aligned Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-05-28T08:25:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PEARL is a pedagogically aligned RL framework using a controllable student simulator, generative reward model, and stable multi-objective scheme to train Socratic tutors that outperform other open-source models on benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29579","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ReactBench: A Cause-Driven Benchmark for Multimodal Hallucination via Systematic Evaluation","primary_cat":"cs.CV","submitted_at":"2026-05-28T08:23:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ReactBench is a new benchmark with four cause-targeted tasks that uses adversarial images, hallucination-inducing queries, and Chain-of-Thought analysis to expose specific failure modes in current multimodal large language models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29523","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"K-FinHallu: A Hallucination Detection Benchmark for Multi-Turn RAG in Korean Finance","primary_cat":"cs.LG","submitted_at":"2026-05-28T07:40:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"K-FinHallu is the first multi-turn Korean financial RAG hallucination benchmark; frontier LLMs struggle especially on justified abstention while an 8B fine-tuned model reaches competitive performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29350","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ConMoE: Expert-Pool Consolidation via Prototype Reassignment for MoE Compression","primary_cat":"cs.AI","submitted_at":"2026-05-28T04:44:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ConMoE consolidates MoE experts into a smaller prototype pool via deterministic remapping based on contribution and replaceability, matching or beating pruning/merging baselines at 25-50% reduction on three models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28095","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SiDP: Memory-Efficient Data Parallelism for Offline LLM Inference","primary_cat":"cs.DC","submitted_at":"2026-05-27T07:52:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SiDP distributes model weights across a DP group with WaS and CaS modes to increase KV cache capacity by up to 1.8x and end-to-end throughput by up to 1.5x over vLLM on H20/H200/B200 GPUs for offline LLM inference.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23651","ref_index":75,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"How Human-Like Are Large Language Models? A Register-Aware Linguistic Evaluation Framework","primary_cat":"cs.CL","submitted_at":"2026-05-22T14:04:25+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23171","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Understanding and Improving Noisy Embedding Techniques in Instruction Finetuning","primary_cat":"cs.LG","submitted_at":"2026-05-22T02:43:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SymNoise applies symmetric noise to embeddings during instruction fine-tuning and reports 6.7% higher AlpacaEval scores than NEFTune on LLaMA-2-7B.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23078","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GEMQ: Global Expert-Level Mixed-Precision Quantization for MoE LLMs","primary_cat":"cs.LG","submitted_at":"2026-05-21T22:23:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GEMQ applies global LP-based expert importance estimation and router fine-tuning within progressive quantization to cut memory and speed inference in MoE LLMs with little accuracy loss.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23057","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ModeSwitch-LLM: A Lightweight Phase-Aware Controller for Cross-Mode LLM Inference on a Single GPU","primary_cat":"cs.LG","submitted_at":"2026-05-21T21:46:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A rule-based controller selects among FP16, quantized, speculative, and hybrid modes for single-GPU LLM inference, delivering 2.1x latency speedup and 51.7% lower energy per token with near-baseline accuracy on Llama-3.1-8B.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23019","ref_index":21,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PACE: Two-Timescale Self-Evolution for Small Language Model Agents","primary_cat":"cs.LG","submitted_at":"2026-05-21T20:42:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PACE coordinates low-risk prompt evolution with validated higher-risk control-logic updates to improve frozen SLM agents on benchmarks without model retraining.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22734","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ChronoMedKG: A Temporally-Grounded Biomedical Knowledge Graph and Benchmark for Clinical Reasoning","primary_cat":"cs.CL","submitted_at":"2026-05-21T17:04:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ChronoMedKG builds a temporal biomedical KG with 460k evidence-linked triples across 13k diseases using LLM consensus and introduces the ChronoTQA benchmark showing RAG gains on time-sensitive questions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22731","ref_index":10,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Post-Training is About States, Not Tokens: A State Distribution View of SFT, RL, and On-Policy Distillation","primary_cat":"cs.LG","submitted_at":"2026-05-21T17:03:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A state distribution view of post-training shows that on-policy supervision from the learner itself can outperform fixed-dataset SFT and preserve retention better than aggressive supervised updates.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22675","ref_index":31,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Self-Policy Distillation via Capability-Selective Subspace Projection","primary_cat":"cs.CL","submitted_at":"2026-05-21T16:18:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Self-Policy Distillation extracts a capability subspace from model gradients on correctness tokens, projects KV activations into it for self-generation, and fine-tunes LLMs to achieve up to 13-16% gains over baselines without external signals.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22636","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A Multi-Source Framework for Relational Validation of Large Language Models Using Expert-Curated Encyclopedic Sources","primary_cat":"cs.SI","submitted_at":"2026-05-21T15:41:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"The multi-source framework identifies a consistent relational deficit in LLMs, where they recognize domain concepts but fail to reproduce their relational structures when compared to expert encyclopedias across fields like sociology and philosophy.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22287","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SciCore-Mol: Augmenting Large Language Models with Pluggable Molecular Cognition Modules","primary_cat":"cs.AI","submitted_at":"2026-05-21T10:37:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SciCore-Mol augments LLMs with three integrated modules for molecular perception, latent diffusion generation, and reaction reasoning, claiming an 8B open model competes with or exceeds proprietary systems on chemical tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22238","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Evaluating Large Language Models as Live Strategic Agents: Provider Performance, Hybrid Decomposition, and Operational Gaps in Timed Risk Play","primary_cat":"cs.AI","submitted_at":"2026-05-21T09:41:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Gemini-3.1-pro-preview won 20 of 32 Risk games through superior objective tracking and execution conversion, while a hybrid test with fixed execution showed near-equal planner performance across providers.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21699","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"X-Token: Projection-Guided Cross-Tokenizer Knowledge Distillation","primary_cat":"cs.LG","submitted_at":"2026-05-20T19:59:31+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"X-Token proposes projection-guided P-KL and H-KL losses to fix uncommon-token suppression and over-conservative matching in logit-based cross-tokenizer distillation, yielding gains over GOLD on Llama-3.2-1B.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21177","ref_index":35,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ChunkFT: Byte-Streamed Optimization for Memory-Efficient Full Fine-Tuning","primary_cat":"cs.LG","submitted_at":"2026-05-20T13:44:44+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ChunkFT enables full-parameter fine-tuning of Llama 3-8B on one 24 GB GPU and Llama 3-70B on two 80 GB GPUs by streaming gradients over dynamically activated sub-tensors.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20722","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AGPO: Adaptive Group Policy Optimization with Dual Statistical Feedback","primary_cat":"cs.LG","submitted_at":"2026-05-20T05:20:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"AGPO adaptively sets trust-region size and exploration temperature from group reward dispersion, entropy, and KL drift, yielding higher scores than PPO and GRPO on nine math benchmarks under fixed token budget.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20520","ref_index":30,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Open-World Evaluations for Measuring Frontier AI Capabilities","primary_cat":"cs.AI","submitted_at":"2026-05-19T21:42:32+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Open-world evaluations using qualitative review of real-world tasks can give earlier warnings of frontier AI capabilities than automated benchmarks, as demonstrated by an AI agent publishing a simple iOS app with one minor human fix.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22864","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Reading Calibrated Uncertainty from Language Model Trajectories","primary_cat":"cs.LG","submitted_at":"2026-05-19T19:24:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Geometric features from per-layer MLP update trajectories fed to a sparse linear probe outperform maximum softmax probability for uncertainty quantification under selective abstention, with gains up to 21 AURC points.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20005","ref_index":16,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Fine-Tuning Without Forgetting via Loss-Adaptive Learning Rates","primary_cat":"cs.LG","submitted_at":"2026-05-19T15:36:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"FINCH is a loss-adaptive learning-rate schedule that reduces forgetting by 93% on average during LLM fine-tuning while matching standard task performance across several benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19377","ref_index":24,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"The Evaluation Game: Beyond Static LLM Benchmarking","primary_cat":"cs.LG","submitted_at":"2026-05-19T05:22:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Presents a game-theoretic model with group actions for data augmentation in LLM adversarial evaluation, demonstrating local generalization from fine-tuning on three model families and redefining benchmarks as orbits under group actions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19344","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Retrieval-Augmented Linguistic Calibration","primary_cat":"cs.CL","submitted_at":"2026-05-19T04:31:38+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19276","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"OpenCompass: A Universal Evaluation Platform for Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-19T02:50:11+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19193","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Sequential Consensus for Multi-Agent LLM Debates: A Wald-SPRT compute governor with calibration-based failure detection","primary_cat":"cs.LG","submitted_at":"2026-05-18T23:43:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Adapts SPRT as a compute governor for multi-agent LLM debates using Beta-modeled consensus scores from an LLM judge, yielding 3.7x call reduction on GSM8K at -2pp accuracy versus fixed rounds.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}