{"total":10,"items":[{"citing_arxiv_id":"2605.23346","ref_index":51,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Contrastive Distribution Matching for Amortized Sequential Monte Carlo in Discrete Diffusion","primary_cat":"cs.LG","submitted_at":"2026-05-22T08:06:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CDM amortizes SMC inference for reward-tilted discrete diffusion by training a parameterized twist function on contrastive samples with closed-form kernels.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20408","ref_index":18,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Spectral Souping: A Unified Framework for Online Preference Alignment","primary_cat":"cs.LG","submitted_at":"2026-05-19T19:04:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Spectral Souping learns offline specialized policies for fine-grained preferences and merges them online using a discovered universal spectral representation for efficient LLM alignment.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12288","ref_index":152,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"TokenRatio: Principled Token-Level Preference Optimization via Ratio Matching","primary_cat":"cs.CL","submitted_at":"2026-05-12T15:44:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"TBPO posits a token-level Bradley-Terry model and derives a Bregman-divergence density-ratio matching loss that generalizes DPO while preserving token-level optimality.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10843","ref_index":27,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Training-Free Cultural Alignment of Large Language Models via Persona Disagreement","primary_cat":"cs.CL","submitted_at":"2026-05-11T16:55:16+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DISCA converts within-country disagreement among World Values Survey personas into a bounded logit correction that reduces cultural misalignment by 10-24% on MultiTP for models 3.8B and larger across 20 countries, without any weight updates.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.15306","ref_index":34,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Generalization in LLM Problem Solving: The Case of the Shortest Path","primary_cat":"cs.AI","submitted_at":"2026-04-16T17:59:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LLMs show strong spatial generalization to unseen maps in shortest-path tasks but fail length scaling due to recursive instability, with data coverage setting hard limits.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2507.06419","ref_index":19,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Reward Models Can Improve Themselves: Reward-Guided Adversarial Failure Mode Discovery for Robust Reward Modeling","primary_cat":"cs.CL","submitted_at":"2025-07-08T21:56:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"REFORM uses reward-guided controlled decoding to generate adversarial failures and augments training data to improve reward model robustness on preference datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2505.19075","ref_index":26,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Universal Reasoner: A Single, Composable Plug-and-Play Reasoner for Frozen LLMs","primary_cat":"cs.AI","submitted_at":"2025-05-25T10:19:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"UniR is a composable reasoning module trained with verifiable rewards and added to frozen LLMs via logit summation, enabling modular composition and weak-to-strong generalization across tasks and model sizes.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2504.16559","ref_index":50,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Synergistic Benefits of Joint Molecule Generation and Property Prediction","primary_cat":"cs.LG","submitted_at":"2025-04-23T09:36:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Hyformer jointly models molecule generation and property prediction via alternating attention and joint pre-training, showing synergistic gains in conditional sampling, OOD prediction, and a drug design case for antimicrobial peptides.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2408.08812","ref_index":27,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"TRAM: Test-Time Risk Adaptation with Mixture of Agents","primary_cat":"cs.LG","submitted_at":"2024-08-16T15:47:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"TRAM is a test-time mixture method that scores and composes risk-neutral source policies using reward and occupancy-based risk to achieve new reward-risk tradeoffs without parameter updates.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2408.07199","ref_index":221,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Agent Q: Advanced Reasoning and Learning for Autonomous AI Agents","primary_cat":"cs.AI","submitted_at":"2024-08-13T20:52:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Agent Q integrates MCTS-guided search, self-critique, and off-policy DPO to train LLM agents that outperform behavior cloning and reinforced fine-tuning baselines in WebShop and achieve up to 95.4% success in real-world booking scenarios.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}