{"total":32,"items":[{"citing_arxiv_id":"2605.26738","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"KARMA: Karma-Aligned Reward Model Adaptation","primary_cat":"cs.CL","submitted_at":"2026-05-26T09:12:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"KARMA adapts reward models from Reddit karma data to align LLMs with conversational pragmatics, finding that context-only rewards outperform karma-predictive ones downstream while reducing factuality across conditions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23244","ref_index":74,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Convex Optimization for Alignment and Preference Learning on a Single GPU","primary_cat":"cs.LG","submitted_at":"2026-05-22T05:25:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"COALA applies convex optimization reformulations of neural networks to direct preference optimization, claiming single-GPU training with ~18% of DPO's TFLOPs and competitive performance on multiple datasets and models up to 8B parameters.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21602","ref_index":94,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Benchmarking and Improving Monitors for Out-Of-Distribution Alignment Failure in LLMs","primary_cat":"cs.AI","submitted_at":"2026-05-20T18:08:21+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15300","ref_index":50,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Deep Pre-Alignment for VLMs","primary_cat":"cs.CV","submitted_at":"2026-05-14T18:14:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Deep Pre-Alignment uses a small VLM perceiver instead of ViT to pre-align visual features with LLM text space, yielding 1.9-3.0 point gains on multimodal benchmarks and 32.9% less language forgetting.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11857","ref_index":20,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Beyond Parameter Aggregation: Semantic Consensus for Federated Fine-Tuning of LLMs","primary_cat":"cs.LG","submitted_at":"2026-05-12T09:41:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Semantic consensus on model outputs for public prompts enables federated LLM fine-tuning that matches parameter-aggregation baselines with orders-of-magnitude lower communication.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10933","ref_index":100,"ref_count":3,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DECO: Sparse Mixture-of-Experts with Dense-Comparable Performance on End-Side Devices","primary_cat":"cs.LG","submitted_at":"2026-05-11T17:58:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DECO is a sparse MoE architecture with ReLU-based routing, learnable expert scaling, and NormSiLU activation that matches dense Transformer performance at 20% expert activation and delivers 2.93x speedup on Jetson AGX Orin.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09391","ref_index":17,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Do Linear Probes Generalize Better in Persona Coordinates?","primary_cat":"cs.AI","submitted_at":"2026-05-10T07:38:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Persona axes derived from contrastive prompts and PCA yield linear probes that generalize better than raw-activation probes across 10 datasets for deception and sycophancy.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.27861","ref_index":4,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"TwinGate: Stateful Defense against Decompositional Jailbreaks in Untraceable Traffic via Asymmetric Contrastive Learning","primary_cat":"cs.CR","submitted_at":"2026-04-30T13:44:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"TwinGate deploys a stateful dual-encoder system with asymmetric contrastive learning to detect decompositional jailbreaks in untraceable LLM traffic at high recall and low false-positive rate with negligible latency.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"the intent level, ensuring that all fragments derived from the same intent reside within the same split, and that the model is evaluated exclusively on intents unseen during training. Unfragmented Benign Requests.To model the broad distribution of safe user interactions, we curate benign queries sourced from five high-quality instruction datasets: CodeAlpaca [2], Dolly [3], LMSYS-Chat-1M [37], UltraChat [4], and WizardLM [30]. Our curation pipeline prioritizes both safety and semantic dis- tinctness. We first exclude all instructions flagged as potentially unsafe in their source datasets. The remaining candidates are then subjected to an additional safety audit via Llama-3-8B-Guard [12], and any query failing the guardrail check is discarded. To reduce"},{"citing_arxiv_id":"2604.25699","ref_index":3,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"NVLLM: A 3D NAND-Centric Architecture Enabling Edge on-Device LLM Inference","primary_cat":"cs.AR","submitted_at":"2026-04-28T14:26:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"NVLLM offloads FFN computations to integrated 3D NAND flash with page-level access and keeps attention in DRAM, delivering 16.7x-37.9x speedups over GPU out-of-core baselines for models up to 30B parameters.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.24334","ref_index":10,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Reducing Redundancy in Retrieval-Augmented Generation through Chunk Filtering","primary_cat":"cs.CL","submitted_at":"2026-04-27T11:23:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Entity-based chunk filtering reduces RAG vector index size by 25-36% with retrieval quality near baseline levels.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.20933","ref_index":20,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"IRIS: Interpolative R\\'enyi Iterative Self-play for Large Language Model Fine-Tuning","primary_cat":"cs.LG","submitted_at":"2026-04-22T11:52:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"IRIS unifies self-play fine-tuning under an interpolative Rényi objective with adaptive alpha scheduling and reports better benchmark scores than baselines while surpassing full supervised fine-tuning with only 13% of the annotated data.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Large language models (LLMs) have achieved remarkable capabilities through post-training align- ment with human preferences[47, 68, 13, 61, 52, 51, 4, 28]. The prevailing approaches, including reinforcement learning from human feedback (RLHF)[12, 5, 83, 6] and direct preference optimization (DPO)[56, 69, 80], rely on curated preference datasets that are expensive to acquire and difficult to scale[20, 44, 39, 88]. This practical constraint motivates a natural question: can an LLM continue to improve beyond the ceiling of supervised fine-tuning (SFT)[52, 71] without any additional human- annotated data? Self-play fine-tuning (SPIN)[11] provides an affirmative answer. Drawing inspiration from the self-play mechanism in AlphaGo Zero[63], SPIN frames fine-tuning as a two-player game"},{"citing_arxiv_id":"2605.05227","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Rethinking Data Curation in LLM Training: Online Reweighting Offers Better Generalization than Offline Methods","primary_cat":"cs.LG","submitted_at":"2026-04-19T14:23:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"ADAPT is an online reweighting framework for LLM training that outperforms offline data selection and mixing methods in cross-benchmark generalization under equal compute.","context_count":1,"top_context_role":"background","top_context_polarity":"unclear","context_text":"cos 𝜙(𝑥), 𝜙(𝑣) \u0001.(10) Equivalently, we can instantiate a representation-based scorer 𝑠ADAPT(𝑣, 𝑖)=cos 𝜙(𝑣), 𝜙(𝑖) \u0001, where 𝜙(·)is the weighted hidden representation. We use a sigmoid function with temperature scaling to produceabsoluteweights that are independent of the batch composition: 𝑤 𝑡 (𝑖)=𝜎 \u0012 𝑠 𝐴𝐷 𝐴𝑃𝑇 (𝑥𝑖) max(𝜏, 𝜖) \u0013 = 1 1+exp(−𝑠 𝐴𝐷 𝐴𝑃𝑇 (𝑥𝑖)/max(𝜏, 𝜖)) (11) 6 Published as a conference paper at ICLR 2026 where 𝜎(·) denotes the sigmoid function, 𝜏 >0 is a temperature hyperparameter (default 𝜏=1.0 ), and 𝜖 >0 is a small constant (e.g., 𝜖=10 −8) used for numerical stability. The temperature parameter 𝜏 controls the steepness of the sigmoid: larger values produce flatter weight distributions, while smaller values create sharper distinctions between high and low similarity samples."},{"citing_arxiv_id":"2604.14682","ref_index":15,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Acceptance Dynamics Across Cognitive Domains in Speculative Decoding","primary_cat":"cs.AI","submitted_at":"2026-04-16T06:38:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Empirical measurements across four NLP domains show task type is a stronger predictor of speculative decoding acceptance than tree depth, with chat uniquely achieving expected accepted length over 1 token per step.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.12817","ref_index":5,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Understanding and Improving Continuous Adversarial Training for LLMs via In-context Learning Theory","primary_cat":"cs.LG","submitted_at":"2026-04-14T14:43:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Continuous adversarial training in the embedding space produces a robust generalization bound for linear transformers that decreases with perturbation radius, tied to singular values of the embedding matrix, and motivates a new regularizer that improves real LLM jailbreak robustness-utility tradeoff","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"wτ ∼ N(0, I d0). We assume that each ICL training pointxτ,i (1≤i≤N)and the query pointx τ,q are drawn fromx τ,i, xτ,q ∼ N(0,Λ)whereΛ∈R d0×d0 is the covariance matrix, and their labels arey τ,i =w ⊤ τ xτ,i andy τ,q =w ⊤ τ xτ,q. Then, the ICL inputZ τ specified by the taskτis given by Zτ := \u0012 xτ,1 · · ·x τ,N xτ,q yτ,1 · · ·y τ,N 0 \u0013 ∈R (d0+1)×(N+1) .(5) Other notations.We denote[n] :={1,· · ·, n}for anyn∈N +. For anyA∈R n×m, we denote ∥A∥2,∞ := max 1≤i≤m ∥Ai,:∥2,∥A∥ 2 be the operator norm, and∥A∥ F be the Frobenius norm. Besides,λ i(A),λ max(A), andλ min(A)denote itsi-th largest, largest, and smallest eigenvalues, whileσ i(A),σ max(A), andσ min(A)denote itsi-th largest, largest, and smallest singular values."},{"citing_arxiv_id":"2604.05868","ref_index":5,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Understanding Performance Gap Between Parallel and Sequential Sampling in Large Reasoning Models","primary_cat":"cs.CL","submitted_at":"2026-04-07T13:28:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Lack of exploration from conditioning on prior answers is the primary reason parallel sampling outperforms sequential sampling in large reasoning models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2512.00778","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"What Is Preference Optimization Doing, and Why?","primary_cat":"cs.LG","submitted_at":"2025-11-30T08:27:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Gradient analysis and ablations show DPO and PPO have different target directions and component roles in preference optimization for LLMs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2511.17408","ref_index":10,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"The Impact of Off-Policy Training Data on Probe Generalisation","primary_cat":"cs.AI","submitted_at":"2025-11-21T17:08:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Off-policy training data for LLM behavior probes causes significant generalization failures especially for intent-based behaviors like deception, and performance on coerced incentivised data correlates with real on-policy success.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2506.06226","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"No Data? No Problem: Synthesizing Security Graphs for Better Intrusion Detection","primary_cat":"cs.CR","submitted_at":"2025-06-06T16:41:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"PROVSYN synthesizes high-fidelity security provenance graphs via graph generation and LLMs to augment imbalanced datasets, improving downstream APT detection accuracy by up to 38% on benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2502.02737","ref_index":167,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"SmolLM2: When Smol Goes Big -- Data-Centric Training of a Small Language Model","primary_cat":"cs.CL","submitted_at":"2025-02-04T21:43:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SmolLM2 is a 1.7B-parameter language model that outperforms Qwen2.5-1.5B and Llama3.2-1B after overtraining on 11 trillion tokens using custom FineMath, Stack-Edu, and SmolTalk datasets in a multi-stage pipeline.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2501.09686","ref_index":32,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Towards Large Reasoning Models: A Survey of Reinforced Reasoning with Large Language Models","primary_cat":"cs.AI","submitted_at":"2025-01-16T17:37:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"The paper surveys reinforced reasoning techniques for LLMs, covering automated data construction, learning-to-reason methods, and test-time scaling as steps toward Large Reasoning Models.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Bert: Pre-training of deep bidirectional transformers for language understand- ing. arXiv preprint arXiv:1810.04805, 2018. [31] Bosheng Ding, Chengwei Qin, Ruochen Zhao, Tianze Luo, Xinze Li, Guizhen Chen, Wen- han Xia, Junjie Hu, Anh Tuan Luu, and Shafiq Joty. Data augmentation using llms: Data perspectives, learning paradigms and challenges. arXiv preprint arXiv:2403.02990, 2024. [32] Ning Ding, Yulin Chen, Bokai Xu, Yujia Qin, Zhi Zheng, Shengding Hu, Zhiyuan Liu, Maosong Sun, and Bowen Zhou. Enhancing chat language models by scaling high-quality instructional conversations. arXiv preprint arXiv:2305.14233, 2023. [33] Qingxiu Dong, Lei Li, Damai Dai, Ce Zheng, Zhiyong Wu, Baobao Chang, Xu Sun, Jingjing Xu, and Zhifang Sui. A survey on in-context learning."},{"citing_arxiv_id":"2412.05579","ref_index":49,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"LLMs-as-Judges: A Comprehensive Survey on LLM-based Evaluation Methods","primary_cat":"cs.CL","submitted_at":"2024-12-07T08:07:24+00:00","verdict":"ACCEPT","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"A survey that organizes LLMs-as-judges research into functionality, methodology, applications, meta-evaluation, and limitations.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"5)HANNA [34], MANS [73], OpenMEVA [73], StoryER [26], PERSER [229] Values Alignment(§6.1.6) PKU-SafeRLHF [96], HHH [6], CVALUES [255] Recommendation(§6.1.7) MovieLens [80], Zhang et al. [284], Yelp [4] Search (§6.1.8) TREC Deep Learning Track [118], MS MARCO v2 collection [11], LeCaRDv2 [129] Comprehensive Data(§6.1.9) HelpSteer [238], HelpSteer2 [237], UltraFeedback [44], UltraChat [49], ShareGPT [37], TruthfulQA [140], AlpacaEval [56],Chatbot Arena [292], MT-Bench [292], WildBench [138], FLASK [269], RewardBench [116], RM-Bench [148], JudgeBench [213],MLLM-as-a-Judge [24], MM-Eval [202] Metric (§6.2) Accuracy, Pearson [41], Spearman [190], Kendall's Tau [191], Cohen's Kappa [240], ICC [13] Fig. 1. Taxonomy of LLMs-as-judges in functionality, methodology, application, meta-evaluation."},{"citing_arxiv_id":"2412.05271","ref_index":58,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Expanding Performance Boundaries of Open-Source Multimodal Models with Model, Data, and Test-Time Scaling","primary_cat":"cs.CV","submitted_at":"2024-12-06T18:57:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"InternVL 2.5 is the first open-source MLLM to surpass 70% on the MMMU benchmark via model, data, and test-time scaling, with a 3.7-point gain from chain-of-thought reasoning.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"VideoChat2-IT (en & zh) [130, 131], EgoTaskQA (en) [99], NTU RGB+D (en) [152], CLEVRER (en) [276], LLaV A-Video (en) [307], FineVideo (en) [67], PerceptionTest (en) [193], HiREST (en) [291], STAR (en) [259],General QA EgoSchema (en) [175], ScanQA (en) [10], LSMDC (en) [201] GUI GUI-World (en) [24] Type: Text Datasets UltraFeedback (en) [48], UltraChat (en) [58], Unnatural-Instructions (en) [90], NoRobots (en) [196], MOSS (en) [221], LIMA (en) [314], SlimOrca (en) [142], WizardLM-Evol-Instruct-70K (en) [265], Llama-3-Magpie-Pro (en) [266], Magpie-Qwen2-Pro (en & zh) [266], KOpen-HQ-Hermes-2.5-60K (ko) [179], Firefly (zh) [270], Dolly (en) [44], OpenAI-Summarize-TLDR (en) [21], Know-Saraswati-CoT (en) [114],"},{"citing_arxiv_id":"2410.10813","ref_index":66,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"LongMemEval: Benchmarking Chat Assistants on Long-Term Interactive Memory","primary_cat":"cs.CL","submitted_at":"2024-10-14T17:59:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LongMemEval benchmarks long-term memory in chat assistants, revealing 30% accuracy drops across sustained interactions and proposing indexing-retrieval-reading optimizations that boost performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2410.10781","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"When Attention Sink Emerges in Language Models: An Empirical View","primary_cat":"cs.CL","submitted_at":"2024-10-14T17:50:28+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Attention sinks emerge in language models from softmax-induced token dependence on attention scores and do not appear when using sigmoid attention without normalization in models up to 1B parameters.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2408.01800","ref_index":30,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"MiniCPM-V: A GPT-4V Level MLLM on Your Phone","primary_cat":"cs.CV","submitted_at":"2024-08-03T15:02:21+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MiniCPM-Llama3-V 2.5 delivers GPT-4V-level multimodal performance on phones through architecture, pretraining, and alignment optimizations.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"Kleister Charity [93], WikiTableQuestions [79], Real-CQA [3], AI2D [49], etc. Chat FSVQA [90], Visual-Dialog [28] 780K Part-2 Part-1 sample from Part-1 data 400K OCR DocVQA, TextVQA, OCR-VQA, VisualMRC, ChartQA, AI2D690KArxivQA [56], LLaV AR [118], TextOCR-GPT4V [16], etc. Instruct SVIT [119], LLaV A-Instruct-150K [62], UniMM-Chat [110], ShareGPT4V [21]1.9MLVIS [36], ALLaV A [18] Text-Only Ultra-Chat [30], Alpaca [97], ShareGPT [120], BELLE [10] -OpenOrca [58], OpenHermes [98], In-House-MiniCPM-SFT Multilingual Generalization. Multimodal capability across multiple languages is essential for serving users from broader communities. Traditional solutions involve extensive multimodal data collection and cleaning, and training for the target languages. Fortunately, recent findings from"},{"citing_arxiv_id":"2406.08464","ref_index":104,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing","primary_cat":"cs.CL","submitted_at":"2024-06-12T17:52:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Magpie synthesizes 300K high-quality alignment instructions from Llama-3-Instruct via auto-regressive prompting on partial templates, enabling fine-tuned models to match official instruct performance on AlpacaEval, ArenaHard, and WildBench.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2404.14469","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"SnapKV: LLM Knows What You are Looking for Before Generation","primary_cat":"cs.CL","submitted_at":"2024-04-22T17:42:58+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SnapKV selects clustered important KV positions per attention head from an observation window at the prompt end, yielding 3.6x faster generation and 8.2x better memory efficiency on 16K-token inputs with comparable performance across 16 datasets.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"Although these techniques may reduce the KV cache size during generation, they do not address the primary challenges of understanding complex prompt contexts, leaving critical issues unresolved. 3 Observations In this section, we present our observations regarding the attention allocation patterns in the Query- Key matrix during token generation. Our analysis utilizes samples from Ultrachat [11], a multi-turns, high-quality instruction dataset consisting of 1.4 million dialogues. We further filter the sequences with response length greater than 512 and prompt length greater than 3k. Our findings are concluded into two key observations as follows: • Pattern can be identified before generation. In this experiment, we split the attention features"},{"citing_arxiv_id":"2403.07691","ref_index":16,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ORPO: Monolithic Preference Optimization without Reference Model","primary_cat":"cs.CL","submitted_at":"2024-03-12T14:34:08+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":8.0,"formal_verification":"none","one_line_summary":"ORPO performs preference alignment during supervised fine-tuning via a monolithic odds ratio penalty, allowing 7B models to outperform larger state-of-the-art models on alignment benchmarks.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"generated per input, we sample the first item for each input and examine their inter cosine similarity with Equation 15 for across-input diversity. Un- like per-input diversity, it is noteworthy that Phi-2 (ORPO) has lower average cosine similarity in the second row of Table 4. We can infer that ORPO triggers the model to generate more instruction- specific responses than DPO. AIDD(θ) = D N[ i=1 Oi,θ,j=1 ! (15) Per Input↓ Across Input↓ Phi-2 + SFT + DPO 0.8012 0.6019 Phi-2 + ORPO 0.8909 0.5173 Llama-2 + SFT + DPO 0.8889 0.5658 Llama-2 + ORPO 0.9008 0.5091 Table 4: Lexical diversity of Phi-2 and Llama-2 fine- tuned with DPO and ORPO. Lower cosine similarity is equivalent to higher diversity. The highest value in each column within the same model family is bolded. 7 Discussion In this section, we expound on the theoretical and computational details of ORPO. The theoretical anal- ysis of ORPO is studied in Section 7.1, which will be supported with the empirical analysis in Section 7.2. Then, we compare the computational load of DPO and ORPO in Section 7.3. 7.1 Comparison to Probability Ratio The rationale for selecting the odds ratio instead of the probability ratio lies in its stability. The prob- ability ratio for generating the favored response yw over the disfavored response yl given an input sequence x can be defined as Equation 16. PRθ(yw, yl) = Pθ(yw|x) Pθ(yl|x) (16) While this formulation has been used in previous preference alignment methods that precede SFT (Rafailov et al., 2023; Azar et al., 2023), the odds ratio is a better choice in the setting where the preference alignment is incorporated in SFT as the odds ratio is more sensitive to the model's prefer- ence understanding. In other words, the probability ratio leads to more extreme discrimination of the disfavored responses than the odds ratio. We visualize this through the sample distribu- tions of the log probability ratio log PR(X2|X1) and log odds ratio log OR(X2|X1). We sample 50,000 samp"},{"citing_arxiv_id":"2403.04652","ref_index":20,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Yi: Open Foundation Models by 01.AI","primary_cat":"cs.CL","submitted_at":"2024-03-07T16:52:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Yi models are 6B and 34B open foundation models pretrained on 3.1T curated tokens that achieve strong benchmark results through data quality and targeted extensions like long context and vision alignment.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"carefully balanced the distribution of instructions across various tags. This approach ensures a diverse finetuning dataset, aiming to achieve enhanced cross-task robustness. To achieve the optimal data ratio for balancing different directions of the capability, we use an approximate grid search to determine our data mixture. Motivated by Dong et al. [20], this process involved experimenting with {1, 1/2, 1/4, 1/8, 1/16, 1/32, 1/64} proportions for each ability. The search process was guided by validation results and our in-house human evaluation sets. ChatML Format Beyond the focus on data quality and diversity, our observations revealed that the format of the data substantially influences the model's ultimate performance."},{"citing_arxiv_id":"2402.08268","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"World Model on Million-Length Video And Language With Blockwise RingAttention","primary_cat":"cs.LG","submitted_at":"2024-02-13T07:47:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Presents open-source 7B models for million-token video and language understanding via Blockwise RingAttention, setting new benchmarks in retrieval and long video tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2401.10020","ref_index":43,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Self-Rewarding Language Models","primary_cat":"cs.CL","submitted_at":"2024-01-18T14:43:47+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Iterative self-rewarding via LLM-as-Judge in DPO training on Llama 2 70B improves instruction following and self-evaluation, outperforming GPT-4 on AlpacaEval 2.0.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2310.16944","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Zephyr: Direct Distillation of LM Alignment","primary_cat":"cs.LG","submitted_at":"2023-10-25T19:25:16+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Zephyr-7B achieves state-of-the-art chat benchmark results among 7B models by distilling alignment via dDPO on AI feedback preferences, surpassing the 70B Llama-2-Chat model on MT-Bench with no human data required.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}