{"total":435,"items":[{"citing_arxiv_id":"2606.27559","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A Sensitivity-Aware Test Collection for Search Among Personal Information","primary_cat":"cs.IR","submitted_at":"2026-06-25T21:24:17+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A new sensitivity-labeled test collection is released from Enron emails with crowdsourced queries, relevance judgments, and LLM extensions for evaluating sensitivity-aware search.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.10357","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Atomic Intent Reasoning: Bringing LLM Semantics to Industrial Cross-Domain Recommendations","primary_cat":"cs.IR","submitted_at":"2026-06-09T03:13:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"AIR framework achieves ~400x faster LLM-based cross-domain recommendation via offline intent construction and online retrieval, with SOTA results on public data and +3.446% GMV lift in live A/B tests.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01414","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Agent Skills Should Go Beyond Text: The Case for Visual Skills","primary_cat":"cs.CV","submitted_at":"2026-05-31T19:22:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"The paper proposes that reusable agent skills should incorporate visual elements alongside text, introduces three forms of visual skills and an automatic conversion system, and reports better performance on GUI and visual-centric tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00888","ref_index":55,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Memory-Efficient LLM Training with Dynamic Sparsity: From Stability to Practical Scaling","primary_cat":"cs.LG","submitted_at":"2026-05-30T20:47:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SMET stabilizes dynamic sparse training of LLMs via optimizer warm-up and density-aware scaling, reducing memory use and enabling practical sparse pre-training as an alternative to dense methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00531","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"State Machine Guided Multi-Relational Synthetic Data from Logs for Anomaly Detection","primary_cat":"cs.MA","submitted_at":"2026-05-30T04:49:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A framework extracts a latent state machine from logs, induces a multi-table relational schema, and uses it as a generative prior to create synthetic data that augments real logs for better anomaly detection.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00511","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Saliency-Aware Model Merging","primary_cat":"cs.LG","submitted_at":"2026-05-30T04:00:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SA-Merging extends SynFlow-style saliency to task vectors, adds merge-aware modulation and iterative pruning, and applies rank-wise decomposition to LoRAs, narrowing the gap to test-time adaptation on vision and language tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00494","ref_index":59,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ProjQ: Project-and-Quantize for Adapter-Aware LLM Compression","primary_cat":"cs.LG","submitted_at":"2026-05-30T02:54:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ProjQ constrains post-training quantization noise to a low-rank manifold through orthogonal subspace projection, enabling better compensation by LoRA adapters and preserving greater model plasticity than standard PTQ.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.07602","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Sample-Efficient Post-Training for LEGO Spatial-Physics Reasoning","primary_cat":"cs.LG","submitted_at":"2026-05-29T09:31:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"PVPO is a sample-efficient RL method that improves semantic, geometric, and physical quality in LLM LEGO assembly generation by mitigating the PhysHack failure mode where validity alone fails to ensure fidelity.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31040","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"UniRTL: Unifying Code and Graph for Robust RTL Representation Learning","primary_cat":"cs.LG","submitted_at":"2026-05-29T09:18:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"UniRTL unifies RTL code and CDFG through mutual masked modeling and hierarchical training with a graph-aware tokenizer, outperforming prior single-modality methods on performance prediction and code retrieval.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30876","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"dMoE: dLLMs with Learnable Block Experts","primary_cat":"cs.CL","submitted_at":"2026-05-29T06:03:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"dMoE aggregates token expert distributions to block level in dLLMs, cutting unique experts from 69.5 to 14.6, memory by 76-80%, and latency by 1.14-1.66x while retaining 99.11% performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30784","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Text-guided Feature Disentanglement for Cross-modal Gait Recognition","primary_cat":"cs.CV","submitted_at":"2026-05-29T03:16:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"TCFDNet uses a Gait Modality Text Dictionary from LLMs, CLIP alignment, and text-guided disentanglement modules to achieve SOTA cross-modal gait recognition on SUSTech1K and FreeGait.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.11232","ref_index":67,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Every Act Has Its Price: Compressed Moral Composition in Frontier LLMs","primary_cat":"cs.CL","submitted_at":"2026-05-29T02:36:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Moral Trolley Arena shows frontier LLMs produce composite moral preferences that are compressed rather than additive functions of calibrated component act strengths across Moral Foundations Theory.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30537","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"The Long-Term Effects of Data Selection in LLM Fine-Tuning","primary_cat":"cs.LG","submitted_at":"2026-05-28T20:12:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Short-term data selectors in multi-stage LLM fine-tuning can slow future learning and increase forgetting, formalized as myopic selection with a proposed LHAS objective to address it.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30140","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AnomalyAgent: Training-Free Agentic Models for Zero-/Few-Shot Anomaly Detection","primary_cat":"cs.CV","submitted_at":"2026-05-28T16:05:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AnomalyAgent is a training-free agentic framework that equips MLLMs with anomaly-centric tools and a memory module to outperform VLM-based methods on both simple and complex contextual anomalies in zero- and few-shot settings.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30126","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PARCEL: Pool-Anchored Resampling with Conditioned Elastic Queries for Efficient Vision-Language Understanding","primary_cat":"cs.CV","submitted_at":"2026-05-28T15:57:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PARCEL is a new visual tokenization architecture combining pool-anchored resampling with conditioned elastic queries to enhance performance-efficiency tradeoffs in LVLMs over prior matryoshka methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30014","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"From GPS Points to Travel Patterns: Flexible and Semantic Trajectory Generation with LLMs","primary_cat":"cs.AI","submitted_at":"2026-05-28T14:39:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"HTP hierarchically generates travel patterns via RQ-VAE tokenization then uses SFT-tuned LLMs to produce conditioned trajectory sequences, outperforming baselines by 29.78% on two datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29667","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Beyond English and Evasion: A Human-Annotated Multi-Domain Benchmark for High-Stakes LLM Safety Evaluation in Chinese","primary_cat":"cs.CL","submitted_at":"2026-05-28T09:28:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Introduces ChiSafe-PAS, a 1,897-prompt human-annotated Chinese adversarial benchmark for LLM safety with 3-class labels, 9-category obfuscation taxonomy, and domain coverage in self-harm, drugs, fraud, and satire.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29657","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"OccamToken: Efficient VLM Inference with Training-Free and Budget-Adaptive Token Pruning","primary_cat":"cs.CV","submitted_at":"2026-05-28T09:20:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"OccamToken replaces absolute token ranking with register-anchored relative evidence testing to enable adaptive, high-ratio visual token pruning in VLMs while preserving most accuracy.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29639","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"RTP-LLM: High-Performance Alibaba LLM Inference Engine","primary_cat":"cs.OS","submitted_at":"2026-05-28T09:07:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"RTP-LLM is a new LLM inference engine achieving 4.7x-6.3x model loading speedup and 1.12x-2.52x throughput gains over vLLM and SGLang via disaggregated phases, multi-tier KV cache, and modular optimizations in production at Alibaba.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29613","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Decoding Strategies for Diffusion-Based ASR: A Systematic Evaluation of Confidence-Based Thresholding","primary_cat":"eess.AS","submitted_at":"2026-05-28T08:48:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Threshold-based decoding for diffusion ASR outperforms fixed schemes by accepting high-confidence tokens early and matches autoregressive accuracy with better speed.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29460","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"FedSmoothLoRA: Toward Smoother and Faster Convergence in Federated Low-Rank Adaptation","primary_cat":"cs.CV","submitted_at":"2026-05-28T06:53:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"FedSmoothLoRA improves federated LoRA fine-tuning by constructing local initializations from a round-matching matrix for cross-round continuity and a gradient-aligned matrix for client-specific guidance, yielding faster convergence than prior methods in image and text tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29421","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Learning Design Skills as Memory Policies for Agentic Photonic Inverse Design","primary_cat":"cs.CL","submitted_at":"2026-05-28T06:14:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SkillPCF is a closed-loop agent framework with a physics-guided memory skill bank, reinforcement-learned skill selection, and simulator-grounded evolution that improves design quality and efficiency for photonic crystal fiber inverse design under limited simulation budgets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29295","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"EvoGM: Learning to Merge LLMs via Evolutionary Generative Optimization","primary_cat":"cs.NE","submitted_at":"2026-05-28T03:22:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"EvoGM uses a dual-generator architecture with cycle-consistent learning on winner-loser pairs from search history to optimize LLM merging coefficients inside a multi-round evolutionary pipeline and reports outperformance over baselines on seen and unseen tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23783","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Benchmarking LLMs for Community Governance Simulation with Life-history Narratives","primary_cat":"cs.CY","submitted_at":"2026-05-22T15:48:49+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23482","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Multimodal Distribution Matching for Vision-Language Dataset Distillation","primary_cat":"cs.CV","submitted_at":"2026-05-22T10:41:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MDM distills vision-language datasets via joint embedding clustering, weight-space model interpolation, and geometry-aware distribution matching on the unit hypersphere.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23141","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"VisAnalog: A Diagnostic Suite for Visual Concept Transfer on Natural Images","primary_cat":"cs.CV","submitted_at":"2026-05-22T01:43:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"VisAnalog is a new controlled benchmark showing VLMs substantially underperform humans on visual concept transfer under one- to four-step deterministic transformations, with relation inference as the main failure mode.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22819","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Cambrian-P: Pose-Grounded Video Understanding","primary_cat":"cs.CV","submitted_at":"2026-05-21T17:59:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Cambrian-P adds per-frame camera pose tokens and a regression head to video MLLMs, delivering 4.5-6.5% gains on spatial benchmarks, generalization to other video QA tasks, and SOTA streaming pose estimation on ScanNet.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"InICCV, 2019. [3] Manuel López Antequera, Pau Gargallo, Markus Hofinger, Samuel Rota Bulo, Yubin Kuang, and Peter Kontschieder. Mapillary planet-scale depth dataset. InECCV, 2020. [4] Iro Armeni, Ozan Sener, Amir R Zamir, Helen Jiang, Ioannis Brilakis, Martin Fischer, and Silvio Savarese. 3d semantic parsing of large-scale indoor spaces. InCVPR, 2016. [5] Jinze Bai, Shuai Bai, Yunfei Chu, Zeyu Cui, Kai Dang, Xiaodong Deng, Yang Fan, Wenbin Ge, Yu Han, Fei Huang, et al. Qwen technical report.arXiv preprint arXiv:2309.16609, 2023. [6] Jinze Bai, Shuai Bai, Shusheng Yang, Shijie Wang, Sinan Tan, Peng Wang, Junyang Lin, Chang Zhou, and Jingren Zhou. Qwen-vl: A frontier large vision-language model with versatile abilities."},{"citing_arxiv_id":"2605.21338","ref_index":39,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Text Analytics Evaluation Framework: A Case Study on LLMs and Social Media","primary_cat":"cs.CL","submitted_at":"2026-05-20T16:05:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Presents a new question-based evaluation framework for LLMs on aggregated social media text and reports that performance declines with input scale, task complexity, and numerical operations beyond 500 instances.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23719","ref_index":28,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Weierstrass Positional Encoding for Vision Transformers","primary_cat":"cs.CV","submitted_at":"2026-05-20T13:35:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"WePE encodes 2D patch positions in Vision Transformers via Weierstrass elliptic functions on the complex plane to exploit double periodicity and derive relative positions algebraically.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21147","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SMoA: Spectrum Modulation Adapter for Parameter-Efficient Fine-Tuning","primary_cat":"cs.LG","submitted_at":"2026-05-20T13:19:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SMoA is a new PEFT adapter that uses block-wise Hadamard-modulated low-rank branches on spectral partitions to cover more pretrained spectral directions than standard LoRA under a smaller parameter budget.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20950","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Focus-then-Context: Subject-Centric Progressive Visual Token Reduction for Vision-Language Models","primary_cat":"cs.CV","submitted_at":"2026-05-20T09:37:53+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SPpruner reduces visual tokens in VLMs via focus identification followed by context-aware scanning, retaining 22.2% tokens for 2.53x speedup on Qwen2.5-VL with negligible accuracy loss.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20942","ref_index":10,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Bridging Structure and Language: Graph-Based Visual Reasoning for Autonomous Road Understanding","primary_cat":"cs.CV","submitted_at":"2026-05-20T09:28:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A graph-grounded Combined Road Substrate framework generates traceable QA pairs from road maps to improve small VLMs on compositional road reasoning tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20369","ref_index":43,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DEL: Digit Entropy Loss for Numerical Learning of Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-19T18:18:59+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DEL is a new loss for LLM numerical learning that applies supervised digit entropy optimization and extends to floating-point numbers, showing improved accuracy and distance metrics over prior methods on math benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20316","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"FullFlow: Upgrading Text-to-Image Flow Matching Models for Bidirectional Vision--Language Generation","primary_cat":"cs.CV","submitted_at":"2026-05-19T17:59:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"FullFlow adds LoRA adapters and discrete text insertion to pretrained rectified-flow text-to-image models, achieving bidirectional generation with major gains in FID, CIDEr, VRAM, and throughput over Dual Diffusion baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19929","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Breaking Modality Heterogeneity in Low-Bit Quantization for Large Vision-Language Models","primary_cat":"cs.CV","submitted_at":"2026-05-19T14:49:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SplitQ improves low-bit PTQ for VLMs by isolating modality-specific outlier channels via MOCD and applying dual-branch adaptive calibration via ACC, outperforming prior methods on six datasets across W4A8 to W3A2 settings.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19568","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"m3BERT: A Modern, Multi-lingual, Matryoshka Bidirectional Encoder","primary_cat":"cs.CL","submitted_at":"2026-05-19T09:13:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"m3BERT uses a three-stage Matryoshka pretraining approach on a bidirectional encoder to support variable embedding sizes while outperforming prior models on large-scale retrieval tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19528","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Towards Camera-Robust 3D Localization: Equation-Anchored Tool-Use for MLLMs","primary_cat":"cs.CV","submitted_at":"2026-05-19T08:30:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Proposes an equation-anchored tool-use method for MLLMs that writes the pinhole back-projection equation in Chain-of-Thought and substitutes retrieved camera intrinsics and depths to achieve robustness in 3D object detection and visual grounding under rescaled intrinsics.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19514","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Position: The Turing-Completeness of Autoregressive Transformers Relies Heavily on Context Management","primary_cat":"cs.AI","submitted_at":"2026-05-19T08:12:33+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19342","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Semantic-Enriched Latent Visual Reasoning","primary_cat":"cs.CV","submitted_at":"2026-05-19T04:29:33+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20266","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A Survey of Large Audio Language Models: Generalization, Trustworthiness, and Outlook","primary_cat":"cs.SD","submitted_at":"2026-05-18T20:21:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A survey of Large Audio Language Models that establishes a taxonomy of trustworthiness vulnerabilities and proposes a Defense-in-Depth roadmap for audio intelligence.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Aleman, D. Almeida, J. Altenschmidt, S. Altman, S. Anadkat et al., \"Gpt-4 technical report,\"arXiv preprint arXiv:2303.08774, 2023. [3] H. Touvron, T. Lavril, G. Izacard, X. Martinet, M.-A. Lachaux, T. Lacroix, B. Rozi `ere, N. Goyal, E. Hambro, F. Azharet al., \"Llama: Open and efficient foundation language models,\"arXiv preprint arXiv:2302.13971, 2023. [4] J. Bai, S. Bai, Y. Chu, Z. Cui, K. Dang, X. Deng, Y. Fan, W. Ge, Y. Han, F. Huanget al., \"Qwen technical report,\"arXiv preprint arXiv:2309.16609, 2023. [5] A. Liu, B. Feng, B. Xue, B. Wang, B. Wu, C. Lu, C. Zhao, C. Deng, C. Zhang, C. Ruanet al., \"Deepseek-v3 technical report,\"arXiv preprint arXiv:2412.19437, 2024. [6] D. Guo, D. Yang, H. Zhang, J."},{"citing_arxiv_id":"2605.18601","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Incantation: Natural Language as the Action Interface for Multi-Entity Video World Models","primary_cat":"cs.CV","submitted_at":"2026-05-18T16:12:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Incantation is the first video world model to use per-frame natural language conditioning for simultaneous multi-entity control and concept-level cross-entity transfer in interactive video generation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18933","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A Geometric Analysis of Sign-Magnitude Asymmetry in a ReLU + RMSNorm Block under Ternary Quantization","primary_cat":"cs.LG","submitted_at":"2026-05-18T15:36:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Sign-flip perturbations produce π/(π-2) ≈ 2.75 times more transverse output energy than equal-norm sign-preserving perturbations in a ReLU + RMSNorm block because ReLU creates directional asymmetry that RMSNorm's transverse projection exposes.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18475","ref_index":38,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GAMMA: Global Bit Allocation for Mixed-Precision Models under Arbitrary Budgets","primary_cat":"cs.LG","submitted_at":"2026-05-18T14:30:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"GAMMA is a post-training framework that learns stable module sensitivity rankings for mixed-precision LLM quantization and projects them to exact bit budgets via integer programming, enabling reuse across arbitrary memory targets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18233","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Enhancing Train-Free Infinite-Frame Generation for Consistent Long Videos","primary_cat":"cs.CV","submitted_at":"2026-05-18T11:28:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MIGA introduces two-stage alignment to close train-inference gaps and dual consistency enhancement via self-reflection and long-range guidance to achieve SOTA temporal consistency in infinite-frame video generation on VBench and NarrLV.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18115","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"WinTok: A Win-Win Hybrid Tokenizer via Decomposing Visual Understanding and Generation with Transferable Tokens","primary_cat":"cs.CV","submitted_at":"2026-05-18T09:24:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"WinTok is a hybrid visual tokenizer that supplements pixel tokens with learnable semantic tokens distilled asymmetrically from foundation models to improve reconstruction, understanding, and generation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18066","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"TIDAL: Recovering Temporal Phase for Cloud Block Storage Placement from LLM-Derived Semantics","primary_cat":"cs.OS","submitted_at":"2026-05-18T08:49:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"TIDAL recovers temporal phase signals from LLM-derived semantics of provisioning metadata to enable complementary CVD placement, reducing overload frequency by 79.1% on production traces.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18007","ref_index":121,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Semantic Reranking at Inference Time for Hard Examples in Rhetorical Role Labeling","primary_cat":"cs.CL","submitted_at":"2026-05-18T08:03:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"RISE is an inference-time semantic reranking framework that refines low-confidence predictions in rhetorical role labeling using contrastively learned label representations, delivering an average +9.15 macro-F1 gain on hard examples across eight datasets and seven models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17967","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Reconciling Contradictory Views on the Effectiveness of SFT in LLMs: An Interaction Perspective","primary_cat":"cs.AI","submitted_at":"2026-05-18T07:22:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SFT on LLMs removes noise-like token interactions in a brief early phase before introducing overfitted ones, explaining inconsistent effectiveness across model scales.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17954","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A More Word-like Image Tokenization for MLLMs","primary_cat":"cs.CV","submitted_at":"2026-05-18T07:09:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DiVT clusters patch embeddings into coherent semantic units and adapts token count to image complexity, matching or exceeding baselines with fewer visual tokens on multimodal benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17915","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SurgLQA: Scalable Long-Horizon Surgical Video Question Answering","primary_cat":"cs.CV","submitted_at":"2026-05-18T06:24:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SurgLQA introduces FTC for compact long-range video representations and TMS for adaptive test-time scaling, reporting gains on restructured Colon-LQA and REAL-Colon-VQA benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}