{"total":25,"items":[{"citing_arxiv_id":"2605.22821","ref_index":18,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Tokenisation via Convex Relaxations","primary_cat":"cs.CL","submitted_at":"2026-05-21T17:59:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ConvexTok uses convex relaxation of tokenization to a linear program, improving intrinsic metrics, bits-per-byte, and some downstream tasks while certifying near-optimality within 1% at typical vocabulary sizes.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22879","ref_index":23,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Budgeted Dynamic Trace Structures for Token-Efficient Sequential Computation","primary_cat":"cs.DC","submitted_at":"2026-05-20T17:43:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"BDTS is a new data-structural framework for budgeted maintenance of rooted trace graphs, with Rust benchmarks showing compaction of 350k-2.71M tokens to 1k-4k tokens and model input reduction from ~3360 to ~432 tokens.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17398","ref_index":41,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"MiniGPT: Rebuilding GPT from First Principles","primary_cat":"cs.CL","submitted_at":"2026-05-17T11:32:07+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":2.0,"formal_verification":"none","one_line_summary":"MiniGPT is a self-contained PyTorch implementation of standard GPT autoregressive modeling that reaches 1.478 validation loss on Tiny Shakespeare with a 10.77M-parameter model and produces recognizable Shakespeare-style text.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17379","ref_index":14,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Learning Faster with Better Tokens: Parameter-Efficient Vocabulary Adaptation for Specialized Text Summarization","primary_cat":"cs.CL","submitted_at":"2026-05-17T10:45:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Vocabulary adaptation via targeted token addition and replacement improves semantic similarity, domain word usage, and training efficiency for LLM summarization in legal and medical domains.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13429","ref_index":112,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"TokAlign++: Advancing Vocabulary Adaptation via Better Token Alignment","primary_cat":"cs.CL","submitted_at":"2026-05-13T12:23:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"TokAlign++ learns token alignments between LLM vocabularies from monolingual representations to enable faster adaptation, better text compression, and effective token-level distillation across 15 languages with minimal steps.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05773","ref_index":18,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"CircuitFormer: A Circuit Language Model for Analog Topology Design from Natural Language Prompt","primary_cat":"cs.AI","submitted_at":"2026-05-07T07:04:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CircuitFormer is a 511M-parameter encoder-decoder model that generates analog circuit topologies from text prompts at 100% syntactic correctness and 83% functional success using a new subcircuit-mining tokenizer that keeps vocabulary size fixed at 512.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04196","ref_index":13,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"The Impact of Vocabulary Overlaps on Knowledge Transfer in Multilingual Machine Translation","primary_cat":"cs.CL","submitted_at":"2026-05-05T18:39:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Experiments show domain match and language relatedness drive knowledge transfer in multilingual MT more than vocabulary overlap.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04107","ref_index":22,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"TSCG: Deterministic Tool-Schema Compilation for Agentic LLM Deployments","primary_cat":"cs.SE","submitted_at":"2026-05-04T15:35:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"TSCG compiles JSON tool schemas into token-efficient structured text, raising tool-use accuracy for small LLMs from 0% to 84.4% on benchmarks while cutting tokens by 52-57%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.01188","ref_index":35,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Compute Optimal Tokenization","primary_cat":"cs.CL","submitted_at":"2026-05-02T01:53:22+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17814","ref_index":42,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Understanding Secret Leakage Risks in Code LLMs: A Tokenization Perspective","primary_cat":"cs.CR","submitted_at":"2026-04-20T05:12:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"BPE tokenization creates gibberish bias in CLLMs, causing secrets with high character entropy but low token entropy to be preferentially memorized due to training data distribution shifts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.08290","ref_index":4,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Tokalator: A Context Engineering Toolkit for Artificial Intelligence Coding Assistants","primary_cat":"cs.SE","submitted_at":"2026-04-09T14:27:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Tokalator is a toolkit with VS Code extension, calculators, and community resources to monitor and optimize token usage in AI coding environments.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2510.18091","ref_index":14,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Accelerating Vision Transformers with Adaptive Patch Sizes","primary_cat":"cs.CV","submitted_at":"2025-10-20T20:37:11+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"APT adaptively varies patch sizes within a single image to reduce ViT token count, delivering 40-50% throughput gains on large models with no downstream performance loss.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.25826","ref_index":9,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Kairos: Toward Adaptive and Parameter-Efficient Time Series Foundation Models","primary_cat":"cs.LG","submitted_at":"2025-09-30T06:02:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Kairos is a parameter-efficient time series foundation model using dynamic patching tokenizer, mixture-of-size encoding, and spectral-conditioned positional embeddings to improve zero-shot forecasting on heterogeneous data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2507.12720","ref_index":4,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"FLEXITOKENS: Flexible Tokenization for Evolving Language Models","primary_cat":"cs.CL","submitted_at":"2025-07-17T01:55:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"FLEXITOKENS replaces rigid subword tokenizers and fixed-compression auxiliary losses with a simplified boundary-prediction objective in byte-level models, yielding lower over-fragmentation and up to 10-point gains on multilingual and domain-adaptation tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2507.02974","ref_index":87,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"InvisibleInk: High-Utility and Low-Cost Text Generation with Differential Privacy","primary_cat":"cs.LG","submitted_at":"2025-06-30T18:00:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"InvisibleInk achieves high-utility differentially private long-form LLM text generation at 4-8x the cost of non-private generation by isolating and clipping sensitive logits and sampling from a small superset of top-k private tokens without privacy cost.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2506.14123","ref_index":61,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Sampling from Your Language Model One Byte at a Time","primary_cat":"cs.CL","submitted_at":"2025-06-17T02:37:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"An inference-time technique turns BPE-based LMs into byte- or character-level models, solving the prompt boundary problem while unifying vocabularies across different tokenizers.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2502.05171","ref_index":137,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Scaling up Test-Time Compute with Latent Reasoning: A Recurrent Depth Approach","primary_cat":"cs.LG","submitted_at":"2025-02-07T18:55:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A recurrent-depth architecture enables language models to improve reasoning performance by iterating computation in latent space, achieving gains equivalent to much larger models on benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2410.04155","ref_index":31,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Toxic Subword Pruning for Dialogue Response Generation on Large Language Models","primary_cat":"cs.CL","submitted_at":"2024-10-05T13:30:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ToxPrune prunes toxic subwords from BPE tokenizers in LLMs to mitigate toxic dialogue responses and improve diversity on both toxic and non-toxic models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2404.06395","ref_index":36,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"MiniCPM: Unveiling the Potential of Small Language Models with Scalable Training Strategies","primary_cat":"cs.CL","submitted_at":"2024-04-09T15:36:50+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MiniCPM 1.2B and 2.4B models reach parity with 7B-13B LLMs via model wind-tunnel scaling and a WSD scheduler that yields a higher optimal data-to-model ratio than Chinchilla scaling.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2401.06066","ref_index":45,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"DeepSeekMoE: Towards Ultimate Expert Specialization in Mixture-of-Experts Language Models","primary_cat":"cs.CL","submitted_at":"2024-01-11T17:31:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"DeepSeekMoE 2B matches GShard 2.9B performance and approaches a dense 2B model; the 16B version matches LLaMA2-7B at 40% compute by using fine-grained expert segmentation plus shared experts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2303.17564","ref_index":99,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"BloombergGPT: A Large Language Model for Finance","primary_cat":"cs.LG","submitted_at":"2023-03-30T17:30:36+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"BloombergGPT is a 50B parameter LLM trained on a 708B token mixed financial and general dataset that outperforms prior models on financial benchmarks while preserving general LLM performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2205.01068","ref_index":263,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"OPT: Open Pre-trained Transformer Language Models","primary_cat":"cs.CL","submitted_at":"2022-05-02T17:49:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"OPT releases open decoder-only transformers up to 175B parameters that match GPT-3 performance at one-seventh the carbon cost, along with code and training logs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2111.09543","ref_index":17,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"DeBERTaV3: Improving DeBERTa using ELECTRA-Style Pre-Training with Gradient-Disentangled Embedding Sharing","primary_cat":"cs.CL","submitted_at":"2021-11-18T06:48:00+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DeBERTaV3 improves DeBERTa by switching to replaced token detection pre-training and using gradient-disentangled embedding sharing, reaching 91.37% on GLUE and new SOTA on XNLI zero-shot.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2109.00859","ref_index":72,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"CodeT5: Identifier-aware Unified Pre-trained Encoder-Decoder Models for Code Understanding and Generation","primary_cat":"cs.CL","submitted_at":"2021-09-02T12:21:06+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CodeT5 adds identifier-aware pre-training and bimodal dual generation to a T5-style encoder-decoder, yielding better results on defect detection, clone detection, and code-to-text, text-to-code, and code-to-code tasks than prior encoder-only or decoder-only models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"1906.11943","ref_index":31,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Findings of the First Shared Task on Machine Translation Robustness","primary_cat":"cs.CL","submitted_at":"2019-06-27T20:24:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"The first shared task on MT robustness received 23 submissions showing up to +22.33 BLEU gains on noisy Reddit data, with strong human-BLEU correlation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}