{"total":16,"items":[{"citing_arxiv_id":"2606.28589","ref_index":18,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Search for Truth from Reasoning: A Dynamic Representation Editing Framework for Steering LLM Trajectories","primary_cat":"cs.AI","submitted_at":"2026-06-26T20:33:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DynaSteer is a dynamic representation editing framework that uses pattern clustering, Fisher-LDA, and lookahead entropy monitoring to steer LLM reasoning trajectories toward truth on MATH and coding tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.28186","ref_index":3,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Cognitive Episodes in LLM Reasoning Traces Enable Interpretable Human Item Difficulty Prediction","primary_cat":"cs.CL","submitted_at":"2026-06-26T15:32:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Epi2Diff extracts cognitive episode sequences from LRM reasoning traces and combines them with semantic features to predict human item difficulty, outperforming baselines on four educational datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.27981","ref_index":67,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"ToxiREX: A Dataset on Toxic REasoning in ConteXt","primary_cat":"cs.CL","submitted_at":"2026-06-26T11:30:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ToxiREX is a new dataset of 128k Reddit comments in six languages with hierarchical annotations for implicit toxicity in conversational context based on an existing reasoning schema.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.25524","ref_index":4,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Cliff Tokens: Identifying Single-Token Failure Triggers in LLM Mathematical Reasoning","primary_cat":"cs.AI","submitted_at":"2026-06-24T08:03:24+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Cliff tokens are single tokens triggering LLM math reasoning failures, identified via adaptive z-test threshold on token potential; a taxonomy and Cliff-DPO optimization yield up to +6.6 accuracy gains.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.18089","ref_index":129,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"From Reasoning Traces to Reusable Modules: Understanding Compositional Generalization in Language Model Reasoning","primary_cat":"cs.LG","submitted_at":"2026-06-16T15:55:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Introduces a hierarchical latent selection model showing SFT supplies raw module materials in compound traces while RL decomposes them to identify atomic modules and enable recombination for new reasoning configurations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.09932","ref_index":144,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"When RL Fails after SFT: Rejuvenating Model Plasticity for Robust SFT-to-RL Handoff","primary_cat":"cs.LG","submitted_at":"2026-06-07T17:58:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Excessive SFT reduces LLM plasticity for RL; Rejuvenation restores it via base-anchored fusion and targeted neuron resets, yielding better RL performance and OOD generalization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.05106","ref_index":2,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Arithmetic Pedagogy for Language Models","primary_cat":"cs.CL","submitted_at":"2026-06-03T17:09:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A small GPT-2 model trained from scratch on GASING-derived CoT supervision for arithmetic reaches over 80% held-out accuracy, exhibits three learning phases, and develops both procedural and associative reasoning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.04503","ref_index":4,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Smart Picks in the Dark: Towards Efficient RLVR for Reasoning via Tracing Metacognitive Pivots","primary_cat":"cs.LG","submitted_at":"2026-06-03T06:34:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"PivotTrace selects unlabeled data for RLVR by quantifying uncertainty via pivot density from attention dynamics, outperforming full supervision using only 29.3% annotations and converging 2.75 times faster.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29192","ref_index":15,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"ReasonOps: Operator Segmentation for LLM Reasoning Traces","primary_cat":"cs.AI","submitted_at":"2026-05-28T00:08:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Unsupervised clustering on sentence-initial 3-token pivots extracts 7 universal reasoning operators from 44k traces across 12 LLMs that enable model fingerprinting and answer-correctness prediction.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.25052","ref_index":47,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Faithfulness Metrics Don't Measure Faithfulness: A Meta-Evaluation with Ground Truth","primary_cat":"cs.CL","submitted_at":"2026-05-24T12:57:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"Introduces BonaFide benchmark of 3,066 ground-truth labeled CoTs showing most faithfulness metrics perform near chance with biases and poor scaling to longer chains.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22870","ref_index":3,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"The Readout Shortcut: Positional Number Copying Dominates Arithmetic CoT Readout in Small Language Models","primary_cat":"cs.LG","submitted_at":"2026-05-20T00:32:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"In 1-3B instruction-tuned LMs on GSM8K, arithmetic CoT readout is dominated by positional copying of the trailing number before the answer delimiter, accounting for 54-92 percentage points of accuracy.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17187","ref_index":33,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"PluRule: A Benchmark for Moderating Pluralistic Communities on Social Media","primary_cat":"cs.CL","submitted_at":"2026-05-16T22:52:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"PluRule is a new multimodal multilingual benchmark showing that state-of-the-art vision-language models perform only marginally better than a trivial baseline at detecting specific rule violations in pluralistic online communities.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14457","ref_index":8,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Stateful Reasoning via Insight Replay","primary_cat":"cs.AI","submitted_at":"2026-05-14T06:52:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"InsightReplay improves long CoT reasoning by extracting critical insights from the trace and replaying them near the active frontier, delivering +1.65 average accuracy gain across 24 model-benchmark settings.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.22266","ref_index":1,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Large Language Models Decide Early and Explain Later","primary_cat":"cs.CL","submitted_at":"2026-04-24T06:26:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LLMs settle on their answer after a minority of CoT tokens and produce an average 760 more as post-decision explanation, enabling early stopping that saves 500 tokens per query at a 2% accuracy cost.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.22816","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Measuring and curing reasoning rigidity: from decorative chain-of-thought to genuine faithfulness","primary_cat":"cs.CL","submitted_at":"2026-03-24T05:38:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SLRC quantifies genuine step necessity in LLM reasoning as a causal estimator, LC-CoSR training reduces rigidity with stability guarantees, and evaluations reveal a faithfulness-sycophancy paradox across frontier models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2510.24941","ref_index":3,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Can Aha Moments Be Fake? Towards Quantifying Decorative and True Thinking in Chain-of-Thought","primary_cat":"cs.LG","submitted_at":"2025-10-28T20:14:02+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}