{"total":39,"items":[{"citing_arxiv_id":"2606.13111","ref_index":113,"ref_count":1,"confidence":0.88,"is_internal_anchor":true,"paper_title":"M\\\"OVE: A Holistic LLM Benchmark for the German Public Sector","primary_cat":"cs.CL","submitted_at":"2026-06-11T09:37:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MÖVE presents a new German-language benchmark evaluating 39 LLMs on performance and governance criteria using ten public-administration datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.10660","ref_index":6,"ref_count":1,"confidence":0.88,"is_internal_anchor":true,"paper_title":"Accounting for AI Inference in Corporate GHG Inventories: A Four-Tier Methodology for Scope 3 Category 1 Reporting","primary_cat":"cs.CY","submitted_at":"2026-06-09T10:08:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A four-tier framework for AI inference GHG emissions in Scope 3 reporting, progressing from direct physical estimation using GPU benchmarks to EEIO spend-based methods, with a case showing low total emissions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.23701","ref_index":58,"ref_count":1,"confidence":0.88,"is_internal_anchor":true,"paper_title":"Evaluating LLM Usage for Efficient and Explainable Numerical and Classified Implicit Sentiment Analysis of Product Desirability","primary_cat":"cs.CL","submitted_at":"2026-06-04T21:16:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"LLMs achieve Pearson correlations up to 0.97 and 94% classification accuracy on product desirability sentiment from qualitative data, outperforming lexicon and transformer baselines while providing confidence ratings and rationales.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.02971","ref_index":95,"ref_count":1,"confidence":0.88,"is_internal_anchor":true,"paper_title":"EURO-5K: When Does Domain Pretraining Matter? Benchmarking Transformers for EU Reporting Obligation Extraction","primary_cat":"cs.CL","submitted_at":"2026-06-02T00:20:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Introduces EURO-5K dataset from 136 EU acts and benchmarks full fine-tuning vs QLoRA for BERT and LLM models on reporting obligation extraction, reporting 0.89 F1 with limited gains from legal pretraining except under parameter-efficient adaptation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.07632","ref_index":122,"ref_count":1,"confidence":0.88,"is_internal_anchor":true,"paper_title":"Evaluation of ML Resource Utilization Requires Model Life Cycle Assessment","primary_cat":"cs.LG","submitted_at":"2026-05-31T05:58:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"The paper calls for life cycle assessment to capture embodied hardware costs and full pipeline operational costs in AI development and deployment.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18012","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SAS: Semantic-aware Sampling for Generative Dataset Distillation","primary_cat":"cs.CV","submitted_at":"2026-05-18T08:05:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SAS adds semantic scoring with CLIP and a two-stage filter-then-diversity selection process to make generative dataset distillation produce more class-discriminative and diverse compact datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18889","ref_index":2,"ref_count":1,"confidence":0.88,"is_internal_anchor":true,"paper_title":"Soft Learning","primary_cat":"cs.LG","submitted_at":"2026-05-16T22:14:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"UNKNOWN","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Soft Learning optimally combines heterogeneous ML specialists via cross-validated non-negative least squares, achieving top performance on 70% of 37 datasets with formal guarantees and 72-435x CPU speedups over deep networks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17159","ref_index":30,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MADP: A Multi-Agent Pipeline for Sustainable Document Processing with Human-in-the-Loop","primary_cat":"cs.AI","submitted_at":"2026-05-16T21:18:39+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"MADP multi-agent pipeline with human-in-the-loop achieves 97% full automation on 955 real documents, 98.5% accuracy on ablation set, and 69-70% reductions in FTE, energy, and emissions versus manual processing.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14249","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"EnergyLens: Predictive Energy-Aware Exploration for Multi-GPU LLM Inference Optimization","primary_cat":"cs.LG","submitted_at":"2026-05-14T01:37:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"EnergyLens predicts multi-GPU LLM inference energy consumption with 9-13% MAPE and identifies configurations with up to 52x energy efficiency differences.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11733","ref_index":14,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Position: LLM Inference Should Be Evaluated as Energy-to-Token Production","primary_cat":"cs.CE","submitted_at":"2026-05-12T08:15:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"LLM inference should be reframed and evaluated as energy-to-token production with a Token Production Function that accounts for power, cooling, and efficiency ceilings.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"machine learning, 2019. URL https://arxiv.org/abs/1910.09700 . arXiv preprint arXiv:1910.09700. [13] E. Strubell, A. Ganesh, and A. McCallum. Energy and policy considerations for deep learning in nlp. InProceedings of the 57th Annual Meeting of the Association for Computational Linguistics (ACL), pages 3645-3650, 2019. URLhttps://arxiv.org/abs/1906.02243. [14] A. S. Luccioni, Y . Jernite, and E. Strubell. Power hungry processing: Watts driving the cost of ai deployment? InProceedings of the 2024 ACM Conference on Fairness, Accountability, and Transparency (F AccT), 2024. doi: 10.1145/3630106.3658542. URL h t t p s : //doi.org/10.1145/3630106.3658542. [15] MLCommons. Mlperf inference v4.1 power results. Technical report, MLCommons, 2024."},{"citing_arxiv_id":"2605.22840","ref_index":19,"ref_count":1,"confidence":0.88,"is_internal_anchor":true,"paper_title":"The Cognitive Kardashev Scale: Quantifying the Material Envelope of Civilisational Computation","primary_cat":"physics.soc-ph","submitted_at":"2026-05-11T02:43:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Defines a Cognitive Kardashev Scale using total power, cognitive fraction f, compute efficiency η, and brain reference to place current humanity at K ≈ 0.73 and estimate Type I/II capacities.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07096","ref_index":59,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Query-efficient model evaluation using cached responses","primary_cat":"cs.LG","submitted_at":"2026-05-08T01:24:06+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06856","ref_index":202,"ref_count":2,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Benchmarked Yet Not Measured -- Generative AI Should be Evaluated Against Real-World Utility","primary_cat":"cs.LG","submitted_at":"2026-05-07T18:56:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Generative AI evaluation must shift from static benchmark scores to measuring sustained improvements in human capabilities within specific deployment contexts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04901","ref_index":27,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"On the (In-)Security of the Shuffling Defense in the Transformer Secure Inference","primary_cat":"cs.CR","submitted_at":"2026-05-06T13:31:15+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"An attack aligns differently shuffled intermediate activations from secure Transformer inference queries to recover model weights with low error using roughly one dollar of queries.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02300","ref_index":276,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"A Meta Reinforcement Learning Approach to Goals-Based Wealth Management","primary_cat":"cs.LG","submitted_at":"2026-05-04T07:48:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MetaRL pre-trained on GBWM problems delivers near-optimal dynamic strategies in 0.01s achieving 97.8% of DP optimal utility and handles larger problems where DP fails.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.01158","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"The Hidden Cost of Thinking: Energy Use and Environmental Impact of LMs Beyond Pretraining","primary_cat":"cs.CY","submitted_at":"2026-05-01T23:24:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Full development of 7B and 32B Olmo 3 models used 12.3 GWh datacenter energy and emitted 4,251 tCO2eq, with development overheads accounting for 82% of compute and reasoning models costing 17x more to post-train than instruction-tuned ones.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.24805","ref_index":10,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"minAction.net: Energy-First Neural Architecture Design -- From Biological Principles to Systematic Validation","primary_cat":"cs.LG","submitted_at":"2026-04-27T06:26:36+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Large-scale experiments show architecture performance depends on task type, not universality, and a single-parameter energy penalty reduces computational energy by ~1000x with negligible accuracy cost.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.23139","ref_index":18,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"GreenDyGNN: Runtime-Adaptive Energy-Efficient Communication for Distributed GNN Training","primary_cat":"cs.DC","submitted_at":"2026-04-25T04:43:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"GreenDyGNN applies Double-DQN to adapt cache management in distributed GNN training, cutting energy by up to 43% under congestion versus static policies.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.14287","ref_index":116,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Quantum-inspired tensor networks in machine learning models","primary_cat":"cs.LG","submitted_at":"2026-04-15T18:00:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":2.0,"formal_verification":"none","one_line_summary":"Tensor networks developed for quantum states are reviewed as tools for machine learning models, with assessment of their potential computational, explanatory, and privacy advantages alongside remaining challenges.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16291","ref_index":96,"ref_count":1,"confidence":0.88,"is_internal_anchor":true,"paper_title":"AI of the People, by the People, for the People: A Social Choice Approach to Collective Control of Artificial Intelligence","primary_cat":"cs.CY","submitted_at":"2026-04-14T07:42:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Proposes applying social choice theory as a modeling language and axiomatic tool for incorporating collective input across the ML development pipeline.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.06732","ref_index":2,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Extraction of linearized models from pre-trained networks via knowledge distillation","primary_cat":"cs.LG","submitted_at":"2026-04-08T06:53:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Koopman theory plus knowledge distillation yields linearized models from pre-trained nets that outperform standard least-squares Koopman approximations on MNIST and Fashion-MNIST in accuracy and stability.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"We believe the current work will contribute to future research exploring the practical application of s uch hardware. 8 ACKNOWLEDGMENTS This work was ﬁnancially supported in part by grants awarded to JO (JSPS KAKENHI Grant Number JP25H01880). [1] Y . LeCun, Y . Bengio, and G. Hinton, \"Deep learning,\" Nature, vol. 521, no. 7553, pp. 436-444, May 2015. DOI:10.1038/nature14539 [2] E. Strubell, A. Ganesh, and A. McCallum. \"Energy and pol- icy considerations for deep learning in NLP ,\" Proc. 57th Ann . Meeting Assoc. Comp. Linguistics, pp. 3645-3650, July 2019 . DOI:10.18653/v1/P19-1355 [3] E. Strubell, A. Ganesh, and A. McCallum. \"Energy and policy considerations for modern deep learning research,\" Proc. A AAI Conf. Artiﬁcial Intelligence, vol."},{"citing_arxiv_id":"2604.16353","ref_index":38,"ref_count":1,"confidence":0.88,"is_internal_anchor":true,"paper_title":"AgriIR: A Scalable Framework for Domain-Specific Knowledge Retrieval","primary_cat":"cs.IR","submitted_at":"2026-03-17T05:14:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"AgriIR is a configurable RAG framework using modular stages and 1B-parameter models to deliver grounded, citable answers for Indian agricultural information access.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.16951","ref_index":31,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Minimum-Action Learning: Energy-Constrained Symbolic Model Selection for Physical Law Identification from Noisy Data","primary_cat":"cs.LG","submitted_at":"2026-03-16T20:45:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MAL recovers correct symbolic force laws like Kepler gravity from noisy data by minimizing trajectory reconstruction, sparsity, and energy violation, reaching 100% identification via energy criterion on benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2510.14235","ref_index":112,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Spiking Neural Network Architecture Search: A Survey","primary_cat":"cs.NE","submitted_at":"2025-10-16T02:27:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":2.0,"formal_verification":"none","one_line_summary":"A survey of Spiking Neural Network architecture search techniques viewed through a hardware/software co-design lens.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"new hardware architectures to ensure designs are feasible, efficient, and well-aligned with real-world deployment con- straints [94]. This approach typically delivers better perfor- mance and lower costs than separately optimizing hardware or SNNs, but at the expense of greater search complexity, substantial GPU hours, and a large carbon footprint [112]. ANAS [77] exemplifies co-exploration by using an evolu- tionary algorithm to search a comprehensive design space, evaluating hardware performance with the CanMore simulator. Compared to random or grid search, ANAS achieves up to an order-of-magnitude improvement in energy-delay and is four orders of magnitude faster to search. Similarly, ANCoEF [84]"},{"citing_arxiv_id":"2509.24517","ref_index":41,"ref_count":1,"confidence":0.88,"is_internal_anchor":true,"paper_title":"Physics Priors Offer Useful Accuracy-Carbon Trade-Offs in Spatio-Temporal Forecasting","primary_cat":"cs.LG","submitted_at":"2025-09-29T09:34:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Stronger physics priors in neural networks for spatio-temporal shear flow forecasting yield substantially lower training carbon footprints than weak or no priors, though inference savings are less consistent.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2506.06816","ref_index":132,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"How do datasets, developers, and models affect biases in a low-resourced language?: The Case of the Bengali Language","primary_cat":"cs.CL","submitted_at":"2025-06-07T14:46:35+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Bengali sentiment analysis models exhibit persistent identity-based biases across datasets and developer backgrounds despite similar semantic content.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"derstand racial, ethnic, and religious minorities' dialects [81] or clas- sify their linguistic practices as negative and abusive [ 37, 42, 109]. Researchers previously examined the biases of computational sys- tems across different social identity dimensions [ 15, 87], such as gender [72], race [ 109], nationality [ 134], religion [ 12], caste [ 6], age [ 44], occupation [ 132], disability [ 135], and political affilia- tions [1]. Such biases can be put into three categories [ 58]: preex- isting, technical, and emergent. Preexisting bias has its roots in social institutions, practices, and prejudicial attitudes, which can be reinforced in sociotechnical sys- tems through various means. For example, researchers studied how"},{"citing_arxiv_id":"2405.00892","ref_index":18,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Wake Vision: A Tailored Dataset and Benchmark Suite for TinyML Computer Vision Applications","primary_cat":"cs.CV","submitted_at":"2024-05-01T22:33:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Wake Vision pipeline produces a 6M-image person detection dataset for TinyML with 2.2% label error, improving model accuracy up to 6.6% over prior VWW benchmark across architectures and subsets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2305.17493","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"The Curse of Recursion: Training on Generated Data Makes Models Forget","primary_cat":"cs.LG","submitted_at":"2023-05-27T15:10:41+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Use of model-generated content in training causes irreversible loss of distribution tails, termed model collapse, in VAEs, GMMs, and LLMs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2205.01068","ref_index":290,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"OPT: Open Pre-trained Transformer Language Models","primary_cat":"cs.CL","submitted_at":"2022-05-02T17:49:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"OPT releases open decoder-only transformers up to 175B parameters that match GPT-3 performance at one-seventh the carbon cost, along with code and training logs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2204.14198","ref_index":106,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Flamingo: a Visual Language Model for Few-Shot Learning","primary_cat":"cs.CV","submitted_at":"2022-04-29T16:29:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Flamingo models reach new state-of-the-art few-shot results on image and video tasks by bridging frozen vision and language models with cross-attention layers trained on interleaved web-scale data.","context_count":1,"top_context_role":"background","top_context_polarity":"unclear","context_text":"Conference on Neural Information Processing Systems, 2017. [104] David R So, Wojciech Ma 'nke, Hanxiao Liu, Zihang Dai, Noam Shazeer, and Quoc V . Le. Primer: Searching for efﬁcient transformers for language modeling. arXiv:2109.08668, 2021. [105] Emma Strubell, Ananya Ganesh, and Andrew McCallum. Energy and policy considerations for deep learning in NLP. arXiv:1906.02243, 2019. [106] Weijie Su, Xizhou Zhu, Yue Cao, Bin Li, Lewei Lu, Furu Wei, and Jifeng Dai. VL-BERT: Pre-training of generic visual-linguistic representations. arXiv:1908.08530, 2019. [107] Chen Sun, Austin Myers, Carl V ondrick, Kevin Murphy, and Cordelia Schmid. VideoBERT: A joint model for video and language representation learning. In International Conference on"},{"citing_arxiv_id":"2204.06745","ref_index":90,"ref_count":1,"confidence":0.88,"is_internal_anchor":true,"paper_title":"GPT-NeoX-20B: An Open-Source Autoregressive Language Model","primary_cat":"cs.CL","submitted_at":"2022-04-14T04:00:27+00:00","verdict":"ACCEPT","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GPT-NeoX-20B is a publicly released 20B parameter autoregressive language model trained on the Pile that shows strong gains in five-shot reasoning over similarly sized prior models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2202.08906","ref_index":71,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"ST-MoE: Designing Stable and Transferable Sparse Expert Models","primary_cat":"cs.CL","submitted_at":"2022-02-17T21:39:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ST-MoE introduces stability techniques for sparse expert models, allowing a 269B-parameter model to achieve state-of-the-art transfer learning results across reasoning, summarization, and QA tasks at the compute cost of a 32B dense model.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2112.04359","ref_index":267,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Ethical and social risks of harm from Language Models","primary_cat":"cs.CL","submitted_at":"2021-12-08T16:09:48+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"The authors provide a detailed taxonomy of 21 risks associated with language models, covering discrimination, information leaks, misinformation, malicious applications, interaction harms, and societal impacts like job loss and environmental costs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2110.01552","ref_index":32,"ref_count":1,"confidence":0.88,"is_internal_anchor":true,"paper_title":"Perhaps PTLMs Should Go to School -- A Task to Assess Open Book and Closed Book QA","primary_cat":"cs.CL","submitted_at":"2021-10-04T16:45:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Proposes a textbook-based true/false QA task where PTLMs score ~50% closed-book even after pre-training on the text and ~60% open-book with retrieval.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2107.06499","ref_index":39,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Deduplicating Training Data Makes Language Models Better","primary_cat":"cs.CL","submitted_at":"2021-07-14T06:06:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Deduplicating training datasets reduces language model verbatim memorization by 10x, improves training efficiency, and enables more accurate evaluation by cutting train-test overlap.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2101.03961","ref_index":33,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity","primary_cat":"cs.LG","submitted_at":"2021-01-11T16:11:52+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Switch Transformers use top-1 expert routing in a Mixture of Experts setup to scale to trillion-parameter language models with constant compute and up to 4x speedup over T5-XXL.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2009.14794","ref_index":149,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Rethinking Attention with Performers","primary_cat":"cs.LG","submitted_at":"2020-09-30T17:09:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Performers approximate full-rank softmax attention in Transformers via FAVOR+ random features for linear complexity, with theoretical guarantees of unbiased estimation and competitive results on pixel, text, and protein tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"1910.09700","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Quantifying the Carbon Emissions of Machine Learning","primary_cat":"cs.CY","submitted_at":"2019-10-21T23:57:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Presents a calculator tool for estimating carbon emissions from ML model training along with mitigation actions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"1907.09881","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Convolutional Dictionary Learning in Hierarchical Networks","primary_cat":"cs.LG","submitted_at":"2019-07-23T13:57:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A hierarchical convolutional dictionary learning model for piecewise smooth signals using recursive scale-detail filtering and sparse coding, learned by alternating minimization and demonstrated on MNIST.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}