{"total":54,"items":[{"citing_arxiv_id":"2606.28057","ref_index":18,"ref_count":1,"confidence":0.88,"is_internal_anchor":true,"paper_title":"MultiHashFormer: Hash-based Generative Language Models","primary_cat":"cs.CL","submitted_at":"2026-06-26T13:03:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MultiHashFormer enables hash-based autoregression in LMs by encoding tokens as multi-hash signatures, outperforming standard Transformers at 100M-3B scales while keeping parameter count constant for multilingual expansion.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.27275","ref_index":6,"ref_count":1,"confidence":0.88,"is_internal_anchor":true,"paper_title":"How Surprising Is Historical Italian to Language Models? Tokenization Tax, Comprehension Tax, and a Simple Mitigation","primary_cat":"cs.CL","submitted_at":"2026-06-25T16:52:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"17th-century Italian imposes a 2.4x surprisal tax on LLMs versus modern Italian with comparable tokenization costs to Russian, yet embeddings stay robust above 0.85 similarity and a temporal prompt reduces surprisal by 60%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.27019","ref_index":16,"ref_count":1,"confidence":0.88,"is_internal_anchor":true,"paper_title":"MinGram: A Minimalist Unigram Tokenizer with High Compression and Competitive Morphological Alignment","primary_cat":"cs.CL","submitted_at":"2026-06-25T13:31:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MinGram is a simplified Unigram tokenizer training method that prioritizes token count minimization to deliver higher compression than BPE and standard Unigram while retaining competitive morphological alignment and superior bits-per-byte performance in language model training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.24163","ref_index":13,"ref_count":1,"confidence":0.88,"is_internal_anchor":true,"paper_title":"CORE-BREW: LLR-Based Soft Decoding for Robust Multi-Bit LLM Watermarking","primary_cat":"cs.CR","submitted_at":"2026-06-23T05:37:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CORE-BREW introduces constant-hit-rate embedding to produce LLRs enabling soft-decision decoding for more robust multi-bit LLM watermarking with two FPR-aware detection modes.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.19626","ref_index":2,"ref_count":1,"confidence":0.88,"is_internal_anchor":true,"paper_title":"Toten: A Knowledge-Based System For Structure-Preserving Representation Of Physical Quantities And Technical Notation In Brazilian Portuguese","primary_cat":"cs.AI","submitted_at":"2026-06-17T22:06:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"TOTEN is a knowledge-based system for structure-preserving representation of physical quantities and technical notation in Brazilian Portuguese using an ontology of engineering entities and external authorities, outperforming statistical baselines in atomicity and reconstruction.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.02100","ref_index":24,"ref_count":1,"confidence":0.88,"is_internal_anchor":true,"paper_title":"PortBERT: Navigating the Depths of Portuguese Language Models","primary_cat":"cs.CL","submitted_at":"2026-06-01T11:32:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"PortBERT releases two RoBERTa models for Portuguese that match or beat prior monolingual and multilingual models on translated GLUE/SuperGLUE tasks while reporting training and inference times.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22879","ref_index":13,"ref_count":1,"confidence":0.88,"is_internal_anchor":true,"paper_title":"Budgeted Dynamic Trace Structures for Token-Efficient Sequential Computation","primary_cat":"cs.DC","submitted_at":"2026-05-20T17:43:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"BDTS is a new data-structural framework for budgeted maintenance of rooted trace graphs, with Rust benchmarks showing compaction of 350k-2.71M tokens to 1k-4k tokens and model input reduction from ~3360 to ~432 tokens.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15562","ref_index":30,"ref_count":1,"confidence":0.88,"is_internal_anchor":true,"paper_title":"GiLT: Augmenting Transformer Language Models with Dependency Graphs","primary_cat":"cs.CL","submitted_at":"2026-05-15T03:08:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GiLT augments Transformers with semantic dependency graphs by modulating attention to improve syntactic generalization while keeping perplexity competitive and enabling better finetuning on downstream tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09949","ref_index":36,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"From Syntax to Semantics: Unveiling the Emergence of Chirality in SMILES Translation Models","primary_cat":"cs.LG","submitted_at":"2026-05-11T03:53:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Chirality emerges in SMILES translation models through an abrupt encoder-centered reorganization of representations after a long plateau, identified via checkpoint analysis and ablation.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"From Syntax to Semantics: Unveiling the Emergence of Chirality in SMILES Translation Models 2.2 Model Architecture To simultaneously achieve the robust featurization of SMILES and the high-fidelity generation of molecular se- quences-while tracking the dynamics of semantic constraint acquisition-we constructed Pan-CORE, an autoregres- sive Transformer-based encoder-decoder model. The overall architecture inherited Transformer-V AE[36] with the variational components removed to yield a standard autoencoder architecture.(Fig. 1a) The encoder featurizes the input token sequence via token embeddings and positional encodings, processing it through N Transformer blocks containing Self-Attention and Feed-Forward Networks (FFN). An aggregation layer then compresses this variable-length sequence"},{"citing_arxiv_id":"2605.09154","ref_index":30,"ref_count":1,"confidence":0.88,"is_internal_anchor":true,"paper_title":"Predicting Large Model Test Losses with a Noisy Quadratic System","primary_cat":"cs.LG","submitted_at":"2026-05-09T20:35:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A noisy quadratic system predicts large model test losses from N, B, K and outperforms Chinchilla's model for extrapolation up to 1000x compute.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"NX n=1 (1−γλ n)2K P np + 1 2 NX n=1 λn R nr γ2 KX k=1 (1−γλ n)2(K−k) 1 B + 1 2 ∞X n=N+1 P np (28) =E irr + 1 2 NX n=1 (1−γ Q nq )2K P np + 1 2 NX n=1 Q nq R nr 1 B γ2 KX k=1 (1−γ Q nq )2(K−k) + 1 2 ∞X n=N+1 P np .(29) 20 Predicting Large Model Test Losses with a Noisy Quadratic System By re-parameterizingQ=:γQ, R=:γR/2, P=:P/2, we get: E[Q(w(K))] (30) =E irr + NX n=1 (1− Q nq )2K P np + NX n=1 QR nq+r 1 B KX k=1 (1− Q nq )2(K−k) + ∞X n=N+1 P np .(31) Other than N, B, K, this function has 7 input arguments: P, p, Q, q, R, r and Eirr. Thus, the model class LNQS(N, B, K) = E[Q(w(K))]has at most 7 degrees of freedom. End of proof. E.2. Weight Norm and Learning Rate Scheduling Learning Rate Schedule"},{"citing_arxiv_id":"2605.05773","ref_index":20,"ref_count":1,"confidence":0.88,"is_internal_anchor":true,"paper_title":"CircuitFormer: A Circuit Language Model for Analog Topology Design from Natural Language Prompt","primary_cat":"cs.AI","submitted_at":"2026-05-07T07:04:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CircuitFormer is a 511M-parameter encoder-decoder model that generates analog circuit topologies from text prompts at 100% syntactic correctness and 83% functional success using a new subcircuit-mining tokenizer that keeps vocabulary size fixed at 512.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04196","ref_index":15,"ref_count":1,"confidence":0.88,"is_internal_anchor":true,"paper_title":"The Impact of Vocabulary Overlaps on Knowledge Transfer in Multilingual Machine Translation","primary_cat":"cs.CL","submitted_at":"2026-05-05T18:39:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Experiments show domain match and language relatedness drive knowledge transfer in multilingual MT more than vocabulary overlap.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"subword-tokenizer algorithm, that creates the sub- words based on frequency and the vocabulary size. The algorithm itself was created already in the 1990's (Gage, 1994), and it has been used for NLP applications for at least a decade (Sennrich et al., 2016), though other algorithms, such as Unigram language models have been proposed and used as well (Kudo, 2018). As Kallini et al. (2025) suggest, given the vo- cabulariesV 1 andV 2 for languagesL 1 andL 2, respectively, the vocabulary overlap ofV 1 andV 2 can be presented asO=V 1 ∩V 2. The size of a joint vocabularyV joint forL 1 andL 2, therefore depends on the size of the vocabulary overlapO. More formally,|V joint|=|V 1|+|V 2|−|O|. How- ever, this is true for a disjoint vocabulary as well,"},{"citing_arxiv_id":"2604.25486","ref_index":11,"ref_count":1,"confidence":0.88,"is_internal_anchor":true,"paper_title":"ReTokSync: Self-Synchronizing Tokenization Disambiguation for Generative Linguistic Steganography","primary_cat":"cs.CR","submitted_at":"2026-04-28T10:42:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ReTokSync resolves tokenization ambiguity in generative linguistic steganography via targeted self-synchronizing resets, achieving over 99.7% extraction accuracy and 100% recovery with an auxiliary channel while matching baseline security and quality.","context_count":1,"top_context_role":"other","top_context_polarity":"unclear","context_text":"1145/3460120.3484550 [10] Taku Kudo. 2018. Subword Regularization: Improving Neural Network Trans- lation Models with Multiple Subword Candidates. InProceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers). Association for Computational Linguistics, Melbourne, Australia, 66-75. doi:10.18653/v1/P18-1007 [11] Taku Kudo and John Richardson. 2018. SentencePiece: A Simple and Language Independent Subword Tokenizer and Detokenizer for Neural Text Processing. InProceedings of the 2018 Conference on Empirical Methods in Natural Language Processing: System Demonstrations. Association for Computational Linguistics, Brussels, Belgium, 66-71. doi:10.18653/v1/D18-2012"},{"citing_arxiv_id":"2604.18563","ref_index":61,"ref_count":1,"confidence":0.88,"is_internal_anchor":true,"paper_title":"Dual Alignment Between Language Model Layers and Human Sentence Processing","primary_cat":"cs.CL","submitted_at":"2026-04-20T17:51:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Later LLM layers align better with human cognitive effort in syntactic ambiguity than early layers do, indicating dual processing modes and complementary benefits from multi-layer probability updates.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17814","ref_index":43,"ref_count":1,"confidence":0.88,"is_internal_anchor":true,"paper_title":"Understanding Secret Leakage Risks in Code LLMs: A Tokenization Perspective","primary_cat":"cs.CR","submitted_at":"2026-04-20T05:12:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"BPE tokenization creates gibberish bias in CLLMs, causing secrets with high character entropy but low token entropy to be preferentially memorized due to training data distribution shifts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17105","ref_index":32,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"How Tokenization Limits Phonological Knowledge Representation in Language Models and How to Improve Them","primary_cat":"cs.CL","submitted_at":"2026-04-18T18:40:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Subword tokenization impairs phonological knowledge encoding in LMs, but an IPA-based fine-tuning method restores it with minimal impact on other capabilities.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.11575","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"MIXAR: Scaling Autoregressive Pixel-based Language Models to Multiple Languages and Scripts","primary_cat":"cs.CL","submitted_at":"2026-04-13T14:53:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MIXAR is the first autoregressive pixel-based language model for eight languages and scripts, with empirical gains on multilingual tasks, robustness to unseen languages, and further improvements when scaled to 0.5B parameters.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.06863","ref_index":19,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Digital Skin, Digital Bias: Uncovering Tone-Based Biases in LLMs and Emoji Embeddings","primary_cat":"cs.SI","submitted_at":"2026-04-08T09:24:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"LLMs handle skin tone emoji modifiers better than dedicated embedding models but display systemic disparities in sentiment and semantic consistency across tones.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.02667","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Unifying Contrastive and Generative Objectives for Visual Understanding and Text-to-Image Generation","primary_cat":"cs.CV","submitted_at":"2026-03-03T06:54:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DREAM introduces Masking Warmup and Semantically Aligned Decoding to let a single encoder handle both contrastive alignment and masked generation, yielding gains over CLIP and FLUID on understanding and generation benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2512.16378","ref_index":53,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Hearing to Translate: The Effectiveness of Speech Modality Integration into LLMs","primary_cat":"cs.CL","submitted_at":"2025-12-18T10:21:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Cascaded systems remain the most reliable for speech translation overall, but recent SpeechLLMs match or outperform them in many conditions while standalone speech models lag.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2510.18091","ref_index":11,"ref_count":1,"confidence":0.88,"is_internal_anchor":true,"paper_title":"Accelerating Vision Transformers with Adaptive Patch Sizes","primary_cat":"cs.CV","submitted_at":"2025-10-20T20:37:11+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"APT adaptively varies patch sizes within a single image to reduce ViT token count, delivering 40-50% throughput gains on large models with no downstream performance loss.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2510.04166","ref_index":16,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Multi Language Models for On-the-Fly Syntax Highlighting","primary_cat":"cs.SE","submitted_at":"2025-10-05T11:48:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Unified multi-language deep learning model for on-the-fly syntax highlighting using normalization and few-shot learning to support six languages with lower deployment cost.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2508.15229","ref_index":10,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"VocabTailor: Dynamic Vocabulary Selection for Downstream Tasks in Small Language Models","primary_cat":"cs.CL","submitted_at":"2025-08-21T04:32:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"VocabTailor introduces a decoupled dynamic vocabulary selection framework that reduces vocabulary-related memory in SLMs by up to 99% with minimal task performance loss.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2503.02574","ref_index":35,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LLM-Safety Evaluations Lack Robustness","primary_cat":"cs.CR","submitted_at":"2025-03-04T12:55:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"LLM safety evaluations are hindered by noise in dataset curation, automated red-teaming, response generation, and LLM-judge evaluation, making fair comparisons difficult and slowing progress.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2411.05527","ref_index":30,"ref_count":1,"confidence":0.88,"is_internal_anchor":true,"paper_title":"How Good is Your Wikipedia? Auditing Data Quality for Low-resource and Multilingual NLP","primary_cat":"cs.CL","submitted_at":"2024-11-08T12:35:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"The study filters non-English Wikipedia, reveals quality problems, proposes a 4-level ranking, and shows filtered data matches or beats raw data in language modeling with largest gains for lower-quality editions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2410.00037","ref_index":50,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Moshi: a speech-text foundation model for real-time dialogue","primary_cat":"eess.AS","submitted_at":"2024-09-17T17:55:39+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Moshi is the first real-time full-duplex spoken large language model that casts dialogue as speech-to-speech generation using parallel audio streams and an inner monologue of time-aligned text tokens.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2408.00118","ref_index":112,"ref_count":1,"confidence":0.88,"is_internal_anchor":true,"paper_title":"Gemma 2: Improving Open Language Models at a Practical Size","primary_cat":"cs.CL","submitted_at":"2024-07-31T19:13:07+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":3.0,"formal_verification":"none","one_line_summary":"Gemma 2 models achieve leading performance at their sizes by combining established Transformer modifications with knowledge distillation for the 2B and 9B variants.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2407.07726","ref_index":68,"ref_count":1,"confidence":0.88,"is_internal_anchor":true,"paper_title":"PaliGemma: A versatile 3B VLM for transfer","primary_cat":"cs.CV","submitted_at":"2024-07-10T14:57:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"PaliGemma is an open 3B VLM based on SigLIP and Gemma that achieves strong performance on nearly 40 diverse open-world tasks including benchmarks, remote-sensing, and segmentation.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Moe-llava: Mixture of ex- perts for large vision-language models, 2024. URL https://arxiv.org/abs/ 2401.15947. [67] T. Lin, M. Maire, S. J. Belongie, L. D. Bour- dev, R. B. Girshick, J. Hays, P. Perona, D. Ramanan, P. Doll'a r, and C. L. Zitnick. Microsoft COCO: common objects in con- text. CoRR, abs/1405.0312, 2014. URL http://arxiv.org/abs/1405.0312. [68] F. Liu, E. Bugliarello, E. M. Ponti, S. Reddy, N. Collier, and D. Elliott. Visually grounded reasoning across languages and cultures. In M.-F. Moens, X. Huang, L. Spe- cia, and S. W.-t. Yih, editors, Proceed- ings of the 2021 Conference on Empirical Methods in Natural Language Processing, pages 10467-10485, Online and Punta Cana, Dominican Republic, Nov."},{"citing_arxiv_id":"2406.07887","ref_index":28,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"An Empirical Study of Mamba-based Language Models","primary_cat":"cs.LG","submitted_at":"2024-06-12T05:25:15+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"An 8B Mamba-2-Hybrid with 43% Mamba-2, 7% attention, and 50% MLP layers exceeds an 8B Transformer by 2.65 points on average across 12 tasks and matches it on 23 long-context tasks while enabling up to 8x faster inference.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2405.09818","ref_index":16,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Chameleon: Mixed-Modal Early-Fusion Foundation Models","primary_cat":"cs.CL","submitted_at":"2024-05-16T05:23:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Chameleon is an early-fusion token model that handles mixed image-text sequences for understanding and generation, achieving competitive or superior performance to larger models like Llama-2, Mixtral, and Gemini-Pro on captioning, VQA, text, and image tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2403.08295","ref_index":100,"ref_count":1,"confidence":0.88,"is_internal_anchor":true,"paper_title":"Gemma: Open Models Based on Gemini Research and Technology","primary_cat":"cs.CL","submitted_at":"2024-03-13T06:59:16+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Gemma introduces open 2B and 7B LLMs derived from Gemini technology that beat comparable open models on 11 of 18 text tasks and come with safety assessments.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2403.04652","ref_index":42,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Yi: Open Foundation Models by 01.AI","primary_cat":"cs.CL","submitted_at":"2024-03-07T16:52:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Yi models are 6B and 34B open foundation models pretrained on 3.1T curated tokens that achieve strong benchmark results through data quality and targeted extensions like long context and vision alignment.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"Skywork 7B 55.8 7.8 13.4 22.8 Yi 6B 32.5 4.6 15.9 26.3 34B 67.2 14.4 23.2 41.0 Table 3: Comparison of models on GSM8k, MATH, Human-Eval, and MBPP. Code: We report the average pass@1 scores of our models on HumanEval[7] (Chen et al., 2021) and MBPP[2] (Austin et al., 2021). Popular Aggregated Benchmark: We report the overall results for MMLU[ 27](5-shot), CMMLU[42] (5-shot), Gaokao-Bench[90] (5-shot), and BigBench[72] Hard (BBH[74]) (3-shot). By training on a significantly larger number of tokens (3.1T) compared to prior work (usually ≤ 2T), we have observed a substantial performance gain across benchmarks, as shown in Table 2. However, it is important to note that there are still discernible disparities between our model and existing"},{"citing_arxiv_id":"2312.16886","ref_index":63,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MobileVLM : A Fast, Strong and Open Vision Language Assistant for Mobile Devices","primary_cat":"cs.CV","submitted_at":"2023-12-28T08:21:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MobileVLM achieves on-par performance with much larger vision-language models on standard benchmarks while delivering state-of-the-art inference speeds of 21.5 tokens per second on Snapdragon 888 CPU and 65.3 on Jetson Orin GPU.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"models has rapidly emerged. Table 1 summarizes them in a detailed comparison regarding architectures, cross- modality design, and training corpora. Architecture choices. As a consequence of the intim- idating training cost of large language models, most lan- guage models used in VLMs are pre-trained open-source models like OPT [131], Flan-T5 [26], Chinchilla [63], Vi- cuna [118] and LLaMA [115]. QWen adapts LLaMA with custom variations [4] to obtain an LLM on their own. Visual backbones in VLMs are typically vision trans- former [34], but pre-trained in various strategies [37,66,97]. Most VLMs prefer CLIP-fashioned ViT [97] trained with natural language supervision. Flamingo picks NFNet-F6 [59]. KOSMOS chooses VLMo [6] instead."},{"citing_arxiv_id":"2312.11805","ref_index":47,"ref_count":1,"confidence":0.88,"is_internal_anchor":true,"paper_title":"Gemini: A Family of Highly Capable Multimodal Models","primary_cat":"cs.CL","submitted_at":"2023-12-19T02:39:27+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Gemini Ultra reaches human-expert performance on MMLU for the first time and sets new state-of-the-art results on 30 of 32 benchmarks, including all 20 multimodal ones tested.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2310.17591","ref_index":15,"ref_count":1,"confidence":0.88,"is_internal_anchor":true,"paper_title":"Lil-Bevo: Explorations of Strategies for Training Language Models in More Humanlike Ways","primary_cat":"cs.CL","submitted_at":"2023-10-26T17:13:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"Lil-Bevo applies music pretraining, curriculum learning on sequence length, and targeted masking to small LMs in the BabyLM challenge, finding modest gains from short sequences but overall limited performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2309.10305","ref_index":39,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Baichuan 2: Open Large-scale Language Models","primary_cat":"cs.CL","submitted_at":"2023-09-19T04:13:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Baichuan 2 presents 7B and 13B LLMs trained on 2.6T tokens that match or exceed similar open models on MMLU, CMMLU, GSM8K, HumanEval and excel in medicine and law.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2306.15595","ref_index":11,"ref_count":1,"confidence":0.88,"is_internal_anchor":true,"paper_title":"Extending Context Window of Large Language Models via Positional Interpolation","primary_cat":"cs.CL","submitted_at":"2023-06-27T16:26:26+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Position Interpolation linearly down-scales position indices to extend RoPE context windows to 32768 tokens with 1000-step fine-tuning, delivering strong long-context results on LLaMA 7B-65B while preserving short-context quality.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2305.10403","ref_index":261,"ref_count":1,"confidence":0.88,"is_internal_anchor":true,"paper_title":"PaLM 2 Technical Report","primary_cat":"cs.CL","submitted_at":"2023-05-17T17:46:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"PaLM 2 reports state-of-the-art results on language, reasoning, and multilingual tasks with improved efficiency over PaLM.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2303.17564","ref_index":55,"ref_count":1,"confidence":0.88,"is_internal_anchor":true,"paper_title":"BloombergGPT: A Large Language Model for Finance","primary_cat":"cs.LG","submitted_at":"2023-03-30T17:30:36+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"BloombergGPT is a 50B parameter LLM trained on a 708B token mixed financial and general dataset that outperforms prior models on financial benchmarks while preserving general LLM performance.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"06 characters per token), indicating an above-average amount of markup, which suggests that further cleaning might benefit future model training. 2.3 Tokenization We choose the Unigram tokenizer (Kudo, 2018) instead of a greedy merge-based sub-word tokenizer, such as Byte Pair Encoding (BPE) (Sennrich et al., 2016) or Wordpiece (Schuster and Nakajima, 2012; Wu et al., 2016), based on promising results in Kudo and Richardson (2018) and Bostrom and Durrett (2020). Following GPT-2 (Radford et al., 2019), we treat our data as a sequence of bytes rather than Unicode characters, and we include each of the 256 bytes as tokens. In a pretokenization step, the input byte sequence is broken into chunks by greedily matching the following regular expression: [ A-Za-z]+|[0-9]|[^A-Za-z0-9]+ ."},{"citing_arxiv_id":"2211.15089","ref_index":52,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Continuous diffusion for categorical data","primary_cat":"cs.CL","submitted_at":"2022-11-28T06:08:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"The paper proposes CDCD, a continuous-time and continuous-space diffusion framework for categorical data, and reports results on language modeling tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2211.05100","ref_index":257,"ref_count":1,"confidence":0.88,"is_internal_anchor":true,"paper_title":"BLOOM: A 176B-Parameter Open-Access Multilingual Language Model","primary_cat":"cs.CL","submitted_at":"2022-11-09T18:48:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"BLOOM is a 176B-parameter open-access multilingual language model trained on the ROOTS corpus that achieves competitive performance on benchmarks, with improved results after multitask prompted finetuning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2206.10789","ref_index":33,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Scaling Autoregressive Models for Content-Rich Text-to-Image Generation","primary_cat":"cs.CV","submitted_at":"2022-06-22T01:11:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Scaling an autoregressive Transformer to 20B parameters for text-to-image generation using image token sequences achieves new SOTA zero-shot FID of 7.23 and fine-tuned FID of 3.22 on MS-COCO.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"[31] Jiahui Yu, Yuchen Fan, Jianchao Yang, Ning Xu, Zhaowen Wang, Xinchao Wang, and Thomas Huang. Wide activation for efﬁcient and accurate image super-resolution. arXiv preprint arXiv:1808.08718, 2018. 30 [32] Rico Sennrich, Barry Haddow, and Alexandra Birch. Neural machine translation of rare words with subword units. arXiv preprint arXiv:1508.07909, 2015. [33] Taku Kudo and John Richardson. Sentencepiece: A simple and language independent subword tokenizer and detokenizer for neural text processing. arXiv preprint arXiv:1808.06226, 2018. [34] Rewon Child, Scott Gray, Alec Radford, and Ilya Sutskever. Generating long sequences with sparse transformers. arXiv preprint arXiv:1904.10509, 2019. [35] Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena,"},{"citing_arxiv_id":"2205.01068","ref_index":245,"ref_count":1,"confidence":0.88,"is_internal_anchor":true,"paper_title":"OPT: Open Pre-trained Transformer Language Models","primary_cat":"cs.CL","submitted_at":"2022-05-02T17:49:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"OPT releases open decoder-only transformers up to 175B parameters that match GPT-3 performance at one-seventh the carbon cost, along with code and training logs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2204.02311","ref_index":74,"ref_count":1,"confidence":0.88,"is_internal_anchor":true,"paper_title":"PaLM: Scaling Language Modeling with Pathways","primary_cat":"cs.CL","submitted_at":"2022-04-05T16:11:45+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PaLM 540B demonstrates continued scaling benefits by setting new few-shot SOTA results on hundreds of benchmarks and outperforming humans on BIG-bench.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2202.08906","ref_index":163,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ST-MoE: Designing Stable and Transferable Sparse Expert Models","primary_cat":"cs.CL","submitted_at":"2022-02-17T21:39:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ST-MoE introduces stability techniques for sparse expert models, allowing a 269B-parameter model to achieve state-of-the-art transfer learning results across reasoning, summarization, and QA tasks at the compute cost of a 32B dense model.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2112.04426","ref_index":119,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Improving language models by retrieving from trillions of tokens","primary_cat":"cs.CL","submitted_at":"2021-12-08T17:32:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"RETRO matches GPT-3 and Jurassic-1 performance on the Pile benchmark using 25 times fewer parameters by conditioning on retrieved chunks from a 2-trillion-token database.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2109.00859","ref_index":71,"ref_count":1,"confidence":0.88,"is_internal_anchor":true,"paper_title":"CodeT5: Identifier-aware Unified Pre-trained Encoder-Decoder Models for Code Understanding and Generation","primary_cat":"cs.CL","submitted_at":"2021-09-02T12:21:06+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CodeT5 adds identifier-aware pre-training and bimodal dual generation to a T5-style encoder-decoder, yielding better results on defect detection, clone detection, and code-to-text, text-to-code, and code-to-code tasks than prior encoder-only or decoder-only models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2108.07732","ref_index":4,"ref_count":1,"confidence":0.88,"is_internal_anchor":true,"paper_title":"Program Synthesis with Large Language Models","primary_cat":"cs.PL","submitted_at":"2021-08-16T03:57:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Large language models synthesize Python code from descriptions with log-linear scaling in performance, reaching 59.6% on MBPP via few-shot prompting and 83.8% on MathQA-Python after fine-tuning, while human feedback halves error rates but models fail at predicting program outputs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2104.08691","ref_index":83,"ref_count":1,"confidence":0.88,"is_internal_anchor":true,"paper_title":"The Power of Scale for Parameter-Efficient Prompt Tuning","primary_cat":"cs.CL","submitted_at":"2021-04-18T03:19:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Prompt tuning matches full model tuning performance on large language models while tuning only a small fraction of parameters and improves robustness to domain shifts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2009.14794","ref_index":135,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Rethinking Attention with Performers","primary_cat":"cs.LG","submitted_at":"2020-09-30T17:09:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Performers approximate full-rank softmax attention in Transformers via FAVOR+ random features for linear complexity, with theoretical guarantees of unbiased estimation and competitive results on pixel, text, and protein tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}