{"total":58,"items":[{"citing_arxiv_id":"2605.30247","ref_index":36,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"OOD-GraphLLM: Graph Large Language Model for Out-of-Distribution Generalized Drug Synergy Prediction","primary_cat":"cs.LG","submitted_at":"2026-05-28T17:12:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"OOD-GraphLLM is a graphLLM framework that jointly optimizes molecular graph representations and biomedical semantic language representations for out-of-distribution drug synergy prediction.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22287","ref_index":45,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SciCore-Mol: Augmenting Large Language Models with Pluggable Molecular Cognition Modules","primary_cat":"cs.AI","submitted_at":"2026-05-21T10:37:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SciCore-Mol augments LLMs with three integrated modules for molecular perception, latent diffusion generation, and reaction reasoning, claiming an 8B open model competes with or exceeds proprietary systems on chemical tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21102","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ACL-Verbatim: hallucination-free question answering for research","primary_cat":"cs.CL","submitted_at":"2026-05-20T12:30:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"The work creates a new ground truth dataset for mapping queries to verbatim text spans in research papers and shows a 150M-parameter ModernBERT token classifier achieving 53.6 word-level F1, outperforming LLM extractors at 48.7.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20591","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Do No Harm? Hallucination and Actor-Level Abuse in Web-Deployed Medical Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-20T00:57:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Evaluation of 6233 MedGPTs finds 25-30% with low factual accuracy, 33.6-54.3% violating operational thresholds, and 57% of action-enabled models lacking privacy disclosures.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20369","ref_index":53,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DEL: Digit Entropy Loss for Numerical Learning of Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-19T18:18:59+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DEL is a new loss for LLM numerical learning that applies supervised digit entropy optimization and extends to floating-point numbers, showing improved accuracy and distance metrics over prior methods on math benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17261","ref_index":29,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Unlocking Biological Workflows for Robust Protein-Text Question Answering: A Dual-Dimensional RAG Framework","primary_cat":"cs.IR","submitted_at":"2026-05-17T05:03:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"2D-ProteinRAG is a dual-dimensional RAG framework that incorporates BLAST workflows plus horizontal attribute alignment and vertical homology denoising to improve protein-text QA on both in-distribution and out-of-distribution cases.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17152","ref_index":153,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Multilingual and Multimodal LLMs in the Wild: Building for Low-Resource Languages","primary_cat":"cs.CL","submitted_at":"2026-05-16T20:56:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":2.0,"formal_verification":"none","one_line_summary":"A tutorial synthesizing foundations, recent models such as PALO and Maya, and low-cost methods for tri-modal multilingual AI in resource-constrained settings.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12813","ref_index":125,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"REALISTA: Realistic Latent Adversarial Attacks that Elicit LLM Hallucinations","primary_cat":"cs.CL","submitted_at":"2026-05-12T23:13:50+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10862","ref_index":40,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"RUBEN: Rule-Based Explanations for Retrieval-Augmented LLM Systems","primary_cat":"cs.CL","submitted_at":"2026-05-11T17:10:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"RUBEN discovers minimal rule sets explaining RAG LLM outputs via novel pruning and applies them to evaluate LLM safety against adversarial injections.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09915","ref_index":23,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Position: Academic Conferences are Potentially Facing Denominator Gaming Caused by Fully Automated Scientific Agents","primary_cat":"cs.CL","submitted_at":"2026-05-11T03:07:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Malicious actors could use AI agents to submit large numbers of fake papers, inflating the submission count and thereby raising the acceptance odds for a small set of chosen legitimate papers under stable conference acceptance rates.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08924","ref_index":4,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"PPI2Text: Captioning Protein-Protein Interactions with Coordinate-Aligned Pair-Map Decoding","primary_cat":"cs.CE","submitted_at":"2026-05-09T12:49:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"PPI2Text generates natural-language captions for protein-protein interactions from sequences by encoding each protein with ESM3, building a residue-pair map, and decoding with Qwen3 using coordinate-aligned positional encoding.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"P∈ {A, B} is tied to a dedicated spatial channel and the pair-map sits at the intersection of the two protein channels. As illustrated in the Figure 3, every text or multimodal token is mapped to a unique 3D position IDs (pT , pθ, pφ)as follows: 6 Text tokens: For text token at position n in the full sequence of prompt plus the generated answer, we apply: (pT , pθ, pφ) = (n, n, n)(4) We collapse the 3D-RoPE to its canonical 1D form where all three channels are filled with their actual positions in the sequence. The behavior falls back to the native settings of Qwen3 model, inheriting the pretrained natural language ability. Single-protein tokens (protein A): For single-protein representation tokens of protein A (XA) that is at position n in the full sequence but represents continual residues centered at position i in the"},{"citing_arxiv_id":"2605.07208","ref_index":29,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"FAME: Forecasting Academic Impact via Continuous-Time Manifold Evolution","primary_cat":"cs.LG","submitted_at":"2026-05-08T03:57:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"FAME models scientific topic trajectories in continuous time to forecast paper impact more accurately than LLMs by aligning manuscripts with field momentum in a dynamic latent space.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"[26] Aaditya Singh, Adam Fry, Adam Perelman, Adam Tart, Adi Ganesh, et al. Openai gpt-5 system card.arXiv preprint arXiv:2601.03267, 2025. [27] Alex J Smola and Bernhard Schölkopf. A tutorial on support vector regression.Statistics and computing, 14(3):199-222, 2004. [28] Jiabin Tang, Lianghao Xia, Zhonghang Li, and Chao Huang. Ai-researcher: Autonomous scientific innovation.arXiv preprint arXiv:2505.18705, 2025. [29] Ross Taylor, Marcin Kardas, Guillem Cucurull, Thomas Scialom, Anthony Hartshorn, Elvis Saravia, Andrew Poulton, Viktor Kerkez, and Robert Stojnic. Galactica: A large language model for science.arXiv preprint arXiv:2211.09085, 2022. [30] Jingqi Tong, Mingzhe Li, Hangcheng Li, Yongzhuo Yang, Yurong Mou, Weijie Ma, Zhiheng Xi, Hongji Chen, Xiaoran Liu, Qinyuan Cheng, et al."},{"citing_arxiv_id":"2605.06651","ref_index":5,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AI co-mathematician: Accelerating mathematicians with agentic AI","primary_cat":"cs.AI","submitted_at":"2026-05-07T17:56:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"An interactive AI workbench for mathematicians achieves 48% on FrontierMath Tier 4 and helped solve open problems in early tests.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05573","ref_index":33,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"AstroAlertBench: Evaluating the Accuracy, Reasoning, and Honesty of Multimodal LLMs in Astronomical Classification","primary_cat":"astro-ph.IM","submitted_at":"2026-05-07T01:36:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"AstroAlertBench evaluates multimodal LLMs on astronomical classification accuracy, reasoning, and honesty using real ZTF alerts, revealing that high accuracy often diverges from self-assessed reasoning quality.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05546","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"SPARK: Self-Play with Asymmetric Reward from Knowledge Graphs","primary_cat":"cs.AI","submitted_at":"2026-05-07T00:51:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SPARK constructs unified knowledge graphs from multi-document scientific literature to ground self-play RL with asymmetric roles and verifiable rewards, outperforming flat-corpus baselines especially on longer-hop reasoning tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.03515","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Scale-Dependent Input Representation and Confidence Estimation for LLMs in Materials Property Prediction","primary_cat":"cond-mat.mtrl-sci","submitted_at":"2026-05-05T08:52:14+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Larger LLMs handle detailed crystal descriptions better than small ones, and mean negative log-likelihood of predicted numbers tracks prediction error after fine-tuning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02745","ref_index":28,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Bolek: A Multimodal Language Model for Molecular Reasoning","primary_cat":"cs.LG","submitted_at":"2026-05-04T15:46:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Bolek injects Morgan fingerprint embeddings into an instruction-tuned text model, then fine-tunes on molecular alignment and synthetic chain-of-thought tasks to improve performance and grounding on 15 TDC binary classification endpoints while generalizing to unseen tasks.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"against descriptions over∼280k PubChemSTM pairs; MoMu [26] extends the same contrastive paradigm to graph encoders; FineMolTex [27] combines coarse contrastive alignment with fine-grained masked motif-word matching. LLM-native multimodal models bridge a structural encoder to a language model through a learned projector, mir- roring LLaV A/BLIP-2-style designs: MolCA [17] bridges a graph encoder to Galactica [28] through a Q-Former with a LoRA adapter; InstructMol [18] adds a two-stage recipe with alignment pretraining on330K PubChem pairs; LLaMo [19], 3D-MoLM [29], BioMedGPT [30], and GIT-Mol [31] extend the paradigm to richer 2D/3D inputs and multi-level token pools; MolX [20] additionally injects a fingerprint signal, but as an auxiliary feature alongside a"},{"citing_arxiv_id":"2604.27351","ref_index":64,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Heterogeneous Scientific Foundation Model Collaboration","primary_cat":"cs.AI","submitted_at":"2026-04-30T03:02:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Eywa enables language-based agentic AI systems to collaborate with specialized scientific foundation models for improved performance on structured data tasks.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"InProceedings of the 2024 Conference on Empirical Methods in Natural Language Processing, pages 8783-8817, 2024. [63] Ross Taylor, Marcin Kardas, Guillem Cucurull, Thomas Scialom, Anthony Hartshorn, Elvis Saravia, Andrew Poulton, Viktor Kerkez, and Robert Stojnic. Galactica: A large language model for science. arXiv preprint arXiv:2211.09085, 2022. [64] AitorLewkowycz,AndersAndreassen,DavidDohan,EthanDyer,HenrykMichalewski,VinayRamasesh, Ambrose Slone, Cem Anil, Imanol Schlag, Theo Gutman-Solo, et al. Solving quantitative reasoning problems with language models.Advances in neural information processing systems, 35:3843-3857, 2022. [65] Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon, and Tie-Yan Liu."},{"citing_arxiv_id":"2604.24645","ref_index":5,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"K-MetBench: A Multi-Dimensional Benchmark for Fine-Grained Evaluation of Expert Reasoning, Locality, and Multimodality in Meteorology","primary_cat":"cs.CL","submitted_at":"2026-04-27T16:13:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"UNKNOWN","novelty_score":6.0,"formal_verification":"none","one_line_summary":"K-MetBench shows LLMs have large gaps in interpreting meteorology diagrams and Korean-specific context, with smaller local models beating much larger global ones.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18936","ref_index":2,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Fine-Tuning Small Reasoning Models for Quantum Field Theory","primary_cat":"cs.LG","submitted_at":"2026-04-21T00:21:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Small 7B reasoning models were fine-tuned on synthetic and curated QFT problems using RL and SFT, yielding performance gains, error analysis, and public release of data and traces.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"and reasoning abilities. Minerva [1] demonstrated that training on scientific corpora containing LATEX signif- icantly improves performance on STEM benchmarks. By fine-tuning PaLM on arXiv papers and web pages, Minerva achieved state-of-the-art results on the MATH dataset, employing majority voting to mitigate cal- culation errors. Similarly, Galactica [2] was trained on a massive corpus of scientific knowledge to act as an interface for science, using specialized tokens for citations and reasoning steps, though it faced challenges with hallucination. 3 Beyond these general-purpose scientific models, dedicated mathematical LLMs such as Llemma [14] have shown that continued pretraining on mathematical corpora yields strong performance on formal and"},{"citing_arxiv_id":"2604.18176","ref_index":9,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"QuantumQA: Enhancing Scientific Reasoning via Physics-Consistent Dataset and Verification-Aware Reinforcement Learning","primary_cat":"cs.AI","submitted_at":"2026-04-20T12:33:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"QuantumQA dataset and verification-aware RL with adaptive reward fusion enable an 8B LLM to achieve performance competitive with proprietary models on quantum mechanics tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.10718","ref_index":39,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"SciPredict: Can LLMs Predict the Outcomes of Scientific Experiments in Natural Sciences?","primary_cat":"cs.AI","submitted_at":"2026-04-12T16:28:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LLMs predict outcomes of real scientific experiments at 14-26% accuracy, comparable to human experts, but lack calibration on prediction reliability while humans demonstrate strong calibration.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"which hypotheses to test and parameter regimes to pursue underresourceconstraints. Inawetlab,forinstance,choosing therightconditionsforaproteincrystallizationexperimentcan mean the difference between months of productive research and a dead end [9]. In materials science, predicting which synthesis parameters will yield a desired property helps avoid costlytrial-and-error[ 39]. Eveninfundamentalphysics,identi- fyingwhichparameterregimesmeritexperimentalexploration shapes how we allocate beam time at particle accelerators and space on satellites. A system that could reliably predict the experimental results would reshape the scientific process, accelerating discovery by filtering out suboptimal directions, identifying gaps in current frameworks, and suggesting much"},{"citing_arxiv_id":"2605.02919","ref_index":16,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Heterogeneous Graph Importance Scoring and Clustering with Automated LLM-based Interpretation","primary_cat":"cs.LG","submitted_at":"2026-04-09T18:34:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"An open-data pipeline constructs heterogeneous graphs from OSM, computes five social impact scores per bridge, applies UMAP+HDBSCAN clustering to find archetypes, and uses domain-tuned LLMs to generate policy interpretations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.06788","ref_index":3,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"From Perception to Autonomous Computational Modeling: A Multi-Agent Approach","primary_cat":"cs.CE","submitted_at":"2026-04-08T07:56:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A multi-agent LLM framework autonomously completes the full computational mechanics pipeline from a photograph to a code-compliant engineering report on a steel L-bracket example.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"state?). At every stage, the engineer infers as much as possible from the available information to reduce uncertainty, and the framework presented here aims to replicate this inference process autonomously, while making every decision transparent and traceable. Large language models (LLMs) now demonstrate code generation [1, 2], scientific reasoning [3, 4], and multi-modal understanding [5]. In computational mechanics specifically: constitutive discovery [6], finite element analysis (FEA) code generation [7], next-generation computer-aided engineering (CAE) [8], end-to-end FEA [9, 10]. Physics-informed neural networks (PINNs) [11, 12], neural operators [13, 14], and constitutive artificial neural networks (CANNs) [6] engage with"},{"citing_arxiv_id":"2604.04722","ref_index":28,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Don't Waste Bits! Adaptive KV-Cache Quantization for Lightweight On-Device LLMs","primary_cat":"cs.CV","submitted_at":"2026-04-06T14:45:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A data-driven adaptive policy for KV-cache bit-width selection based on token importance features reduces decoding latency by ~18% and improves accuracy over static quantization while staying near FP16 levels on SmolLM models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.04403","ref_index":18,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"MolDA: Molecular Understanding and Generation via Large Language Diffusion Model","primary_cat":"cs.AI","submitted_at":"2026-04-06T04:04:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MolDA is a multimodal molecular model that uses a discrete large language diffusion backbone plus a hybrid graph encoder to achieve better global coherence and validity than autoregressive approaches.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.04074","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"FactReview: Evidence-Grounded Reviews with Literature Positioning and Execution-Based Claim Verification","primary_cat":"cs.AI","submitted_at":"2026-04-05T11:45:22+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"FactReview extracts claims from ML papers, positions them via literature retrieval, and verifies them through code execution, labeling each as Supported, Partially supported, or In conflict, as shown in a CompGCN case study.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.01965","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Do We Need Bigger Models for Science? Task-Aware Retrieval with Small Language Models","primary_cat":"cs.IR","submitted_at":"2026-04-02T12:28:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Task-aware retrieval with small models partially compensates for reduced scale in scholarly QA but model capacity remains important for complex reasoning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.08022","ref_index":27,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Capacity-Aware Mixture Law Enables Efficient LLM Data Optimization","primary_cat":"cs.LG","submitted_at":"2026-03-09T06:58:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CAMEL is a scaling law capturing nonlinear model-size and mixture interactions to extrapolate optimal data mixtures for large LLMs from small-model experiments, reducing optimization cost by 50% and improving benchmarks by up to 3%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.20816","ref_index":18,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Don't Ignore the Tail: Decoupling top-K Probabilities for Efficient Language Model Distillation","primary_cat":"cs.CL","submitted_at":"2026-02-24T11:54:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A modified divergence decouples top-K teacher probabilities from the distribution tail during distillation, yielding competitive performance on decoder models with standard compute.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.15037","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CircuChain: Disentangling Competence and Compliance in LLM Circuit Analysis","primary_cat":"cs.SE","submitted_at":"2026-01-29T06:13:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Stronger LLMs show near-perfect physical reasoning in circuits but violate explicit sign and polarity instructions in trap setups, while weaker models follow instructions better but reason less accurately.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2601.00264","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"S1-MMAlign: A Large-Scale, Multi-Disciplinary Dataset for Scientific Figure-Text Understanding","primary_cat":"cs.CV","submitted_at":"2026-01-01T08:54:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"S1-MMAlign is a new large-scale dataset of 15.5 million semantically enhanced scientific image-text pairs created via an AI recaptioning pipeline to improve multimodal understanding.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2510.08804","ref_index":18,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MOSAIC: Multi-agent Orchestration for Task-Intelligent Scientific Coding","primary_cat":"cs.CL","submitted_at":"2025-10-09T20:35:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MOSAIC is a training-free multi-agent LLM framework with rationale, coding, reflection, and debugging agents plus a consolidated context window that outperforms prior methods on scientific coding benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.20374","ref_index":50,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CFDLLMBench: A Benchmark Suite for Evaluating Large Language Models in Computational Fluid Dynamics","primary_cat":"cs.CL","submitted_at":"2025-09-19T22:21:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CFDLLMBench is a new benchmark suite with CFDQuery, CFDCodeBench, and FoamBench to evaluate LLMs on graduate-level CFD knowledge, numerical reasoning, and context-dependent code implementation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.07177","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Towards EnergyGPT: A Large Language Model Specialized for the Energy Sector","primary_cat":"cs.CL","submitted_at":"2025-09-08T19:48:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"Fine-tuned LLaMA 3.1-8B variants for the energy sector outperform the base model on domain QA benchmarks, with LoRA delivering similar gains at lower training cost.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2507.21990","ref_index":16,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ChemDFM-R: A Chemical Reasoning LLM Enhanced with Atomized Chemical Knowledge","primary_cat":"cs.CE","submitted_at":"2025-07-29T16:40:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"ChemDFM-R is a chemical reasoning LLM trained via a four-stage pipeline on the ChemFG dataset of functional-group annotations for molecules and reactions, reaching performance comparable to or better than commercial models on chemical benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2505.10465","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Superposition Yields Robust Neural Scaling","primary_cat":"cs.LG","submitted_at":"2025-05-15T16:18:13+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Strong superposition causes neural loss to scale as the inverse of model dimension due to geometric feature overlaps, explaining scaling laws for broad frequency distributions.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"arXiv preprint arXiv:2307.15016, 2023. https://arxiv.org/abs/2307.15016. [7] Aitor Lewkowycz, Anders Andreassen, David Dohan, Ethan Dyer, Henryk Michalewski, Vinay Ramasesh, Ambrose Slone, Cem Anil, Imanol Schlag, Theo Gutman-Solo, et al. Solving quan- titative reasoning problems with language models.Advances in neural information processing systems, 35:3843-3857, 2022. [8] Ross Taylor, Marcin Kardas, Guillem Cucurull, Thomas Scialom, Anthony Hartshorn, Elvis Saravia, Andrew Poulton, Viktor Kerkez, and Robert Stojnic. Galactica: A large language model for science.arXiv preprint arXiv:2211.09085, 2022. [9] Stephen Wolfram. Wolfram|alpha as the computation engine for gpt models, 2023. https://www.wolfram.com/wolfram-alpha-openai-plugin."},{"citing_arxiv_id":"2505.04588","ref_index":36,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ZeroSearch: Incentivize the Search Capability of LLMs without Searching","primary_cat":"cs.CL","submitted_at":"2025-05-07T17:30:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ZeroSearch uses supervised fine-tuning to create a simulated retrieval module and curriculum-based RL rollouts that degrade document quality to train LLMs on search capabilities without real search API calls.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2502.02871","ref_index":181,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Position: Multimodal Large Language Models Can Significantly Advance Scientific Reasoning","primary_cat":"cs.CL","submitted_at":"2025-02-05T04:05:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":2.0,"formal_verification":"none","one_line_summary":"Position paper claims multimodal LLMs can significantly advance scientific reasoning and proposes a four-stage roadmap plus challenges and suggestions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2502.02737","ref_index":227,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SmolLM2: When Smol Goes Big -- Data-Centric Training of a Small Language Model","primary_cat":"cs.CL","submitted_at":"2025-02-04T21:43:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SmolLM2 is a 1.7B-parameter language model that outperforms Qwen2.5-1.5B and Llama3.2-1B after overtraining on 11 trillion tokens using custom FineMath, Stack-Edu, and SmolTalk datasets in a multi-stage pipeline.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2412.18084","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Property Enhanced Instruction Tuning for Multi-task Molecule Generation with Large Language Models","primary_cat":"cs.AI","submitted_at":"2024-12-24T01:48:07+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2412.14642","ref_index":34,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Speak-to-Structure: Evaluating LLMs in Open-domain Natural Language-Driven Molecule Generation","primary_cat":"cs.CL","submitted_at":"2024-12-19T08:51:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"S^2-Bench is a new one-to-many benchmark for natural language-driven molecule generation with three tasks, and OpenMolIns is an instruction dataset enabling Llama3.1-8B to outperform GPT-4o and Claude-3.5 on it.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2411.14721","ref_index":34,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MolReFlect: Towards In-Context Fine-grained Alignments between Molecules and Texts","primary_cat":"cs.CL","submitted_at":"2024-11-22T04:28:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MolReFlect introduces a teacher-student framework that automatically creates fine-grained molecule-text alignments to achieve SOTA results on molecule-caption translation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2410.14702","ref_index":40,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Polymath: A Challenging Multi-modal Mathematical Reasoning Benchmark","primary_cat":"cs.AI","submitted_at":"2024-10-06T20:35:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"PolyMATH is a new 5,000-image benchmark where top MLLMs reach at most 41 percent accuracy on multi-modal mathematical reasoning, with ablation showing minimal gain from text over images.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2407.20595","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"HALvest-Contrastive: Retrieval-Like Authorship Attribution with Patch-Level Late Interaction","primary_cat":"cs.DL","submitted_at":"2024-07-30T07:14:04+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2406.17557","ref_index":18,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"The FineWeb Datasets: Decanting the Web for the Finest Text Data at Scale","primary_cat":"cs.CL","submitted_at":"2024-06-25T13:50:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"FineWeb is a curated 15T-token web dataset that produces stronger LLMs than prior open collections, while its educational subset sharply improves performance on MMLU and ARC benchmarks.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Training language models to follow instructions with human feedback, 2022. [17] Aakanksha Chowdhery, Sharan Narang, Jacob Devlin, Maarten Bosma, Gaurav Mishra, Adam Roberts, Paul Barham, Hyung Won Chung, Charles Sutton, Sebastian Gehrmann, et al. Palm: Scaling language modeling with pathways. Journal of Machine Learning Research, 24(240): 1-113, 2023. [18] Ross Taylor, Marcin Kardas, Guillem Cucurull, Thomas Scialom, Anthony Hartshorn, Elvis Saravia, Andrew Poulton, Viktor Kerkez, and Robert Stojnic. Galactica: A large language model for science. arXiv preprint arXiv:2211.09085, 2022. [19] Trieu H. Trinh and Quoc V . Le. A simple method for commonsense reasoning, 2019. [20] Alec Radford, Jeff Wu, Rewon Child, David Luan, Dario Amodei, and Ilya Sutskever."},{"citing_arxiv_id":"2403.03920","ref_index":59,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Enhancing Instructional Quality: Leveraging Computer-Assisted Textual Analysis to Generate In-Depth Insights from Educational Artifacts","primary_cat":"cs.AI","submitted_at":"2024-03-06T18:29:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"AI and NLP applied to educational artifacts within the Instructional Core Framework can identify advantages for teacher coaching, student support, and personalized learning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2402.06196","ref_index":87,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Large Language Models: A Survey","primary_cat":"cs.CL","submitted_at":"2024-02-09T05:37:09+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":3.0,"formal_verification":"none","one_line_summary":"The paper surveys key large language models, their training methods, datasets, evaluation benchmarks, and future research directions in the field.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"that for compute-optimal training, the model size and the number of training tokens should be scaled equally: for every doubling of model size the number of training tokens should also be doubled. They tested this hypothesis by training a predicted compute-optimal model, Chinchilla, that uses the same compute budget as Gopher but with 70B parameters and 4% more more data. Galactica: In [87], Taylor et al. introduced Galactica, a large language model that can store, combine and reason about scientific knowledge. They trained on a large scientific corpus of papers, reference material, knowledge bases and many other sources. Galactica performed well on reasoning, outperforming Chinchilla on mathematical MMLU by 41.3% to 35.7%, and PaLM 540B on MATH with a score of 20."},{"citing_arxiv_id":"2402.02750","ref_index":16,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"KIVI: A Tuning-Free Asymmetric 2bit Quantization for KV Cache","primary_cat":"cs.CL","submitted_at":"2024-02-05T06:06:47+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"KIVI applies asymmetric 2-bit quantization to KV cache with per-channel keys and per-token values, reducing memory 2.6x and boosting throughput up to 3.47x with near-identical quality on Llama, Falcon, and Mistral.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2309.12284","ref_index":68,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MetaMath: Bootstrap Your Own Mathematical Questions for Large Language Models","primary_cat":"cs.CL","submitted_at":"2023-09-21T17:45:42+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Bootstrapping math questions via rewriting creates MetaMathQA; fine-tuning LLaMA-2 on it yields 66.4% on GSM8K for 7B and 82.3% for 70B, beating prior same-size models by large margins.","context_count":1,"top_context_role":"baseline","top_context_polarity":"baseline","context_text":"InternLM [29] 7B 31.2 - GPT-J [71] 6B 34.9 - ChatGLM 2 [81] 6B 32.4 - Qwen [1] 7B 51.6 - Baichuan-2 [4] 7B 24.5 5.6 SFT [70] 7B 41.6 - RFT [79] 7B 50.3 - MAmooTH-CoT [80] 7B 50.5 10.4 WizardMath [43] 7B 54.9 10.7 MetaMath 7B 66.5 19.8 open-source models (11-50B) LLaMA-2 [70] 13B 28.7 3.9 LLaMA-2 [70] 34B 42.2 6.2 MPT [49] 30B 15.2 3.1 Falcon [57] 40B 19.6 2.5 GAL [68] 30B - 12.7 Platypus [50] 13B 25.7 2.5 Orca-Platypus [50] 13B 38.4 3.0 Vicuna [11] 13B 27.6 - Code-LLaMA [61] 13B 36.1 16.4 Baichuan-2 [4] 13B 52.8 10.1 SFT [70] 13B 50.0 - RFT [79] 13B 54.8 - MAmooTH-CoT [80] 13B 56.3 12.9 WizardMath [43] 13B 63.9 14.0 MetaMath 13B 72.3 22.4 open-source models (51-70B) LLaMA-2 [70] 70B 56.8 13.5 RFT [79] 70B 64.8 -"}],"limit":50,"offset":0}