{"total":24,"items":[{"citing_arxiv_id":"2607.01127","ref_index":15,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"$\\text{Log}_\\text{b}$Quant: Quantizing Language Models in Logarithmic Space","primary_cat":"cs.CL","submitted_at":"2026-07-01T16:13:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Log_b Quant is an adjustable-base logarithmic quantization technique that outperforms tensor-wise asymmetric linear quantization at 4-bit precision on language model benchmarks while providing memory savings.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.13300","ref_index":34,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Quantizing Time-Series Models As Dynamical Systems: Trajectory-Based Quantization Sensitivity Score","primary_cat":"cs.LG","submitted_at":"2026-06-11T12:53:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Introduces TQS metric and TQS-PTQ framework that uses dynamical-systems stability to enable a priori, calibration-free mixed-precision post-training quantization for time-series models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.10458","ref_index":16,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Minimum Distortion Quantization with Specified Output Distribution","primary_cat":"cs.IT","submitted_at":"2026-06-09T06:06:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Derives optimal quantizer form X=σ(F^{-1}(F_W(W))) with permutation σ minimizing MMSE under specified output distribution P_X, using majorization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.08891","ref_index":10,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"PALUTE: Processing-In-Memory Acceleration via Lookup Table for Edge LLM Inference","primary_cat":"cs.AR","submitted_at":"2026-06-08T00:33:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"PALUTE is a new PIM accelerator using in-DRAM LUTs on M3D DRAM that reports 1264 TPS at 0.16 W with 12.8x energy efficiency gains over CHIME for quantized edge LLM inference.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29537","ref_index":5,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"The Complexity of Verifying Feedforward Neural Networks in Quantised Settings","primary_cat":"cs.CC","submitted_at":"2026-05-28T07:52:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Verification of fixed-precision quantized FNNs is NP-complete under both LP and BV specifications, matching the rational case, while dynamic quantization with BV specs has established upper bounds complementing known PSPACE-hardness.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.27937","ref_index":53,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Machine learning enables experimental access to photon-by-photon arrival times in scintillation detectors","primary_cat":"physics.ins-det","submitted_at":"2026-05-27T04:15:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Deep learning extracts photon-by-photon arrival times from scintillation detector waveforms using unsupervised training with a physically informed model, enabling improved timing resolution and photon classification in experiments.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19645","ref_index":53,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"K-Quantization and its Impact on Output Performance","primary_cat":"cs.CL","submitted_at":"2026-05-19T10:31:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"Empirical evaluation of quantization effects on eight LLMs across bit widths, showing performance generally declines at lower precision but with model-size-dependent resilience and acceptable accuracy at 2 bits for many cases.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19195","ref_index":77,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"The Thermodynamic Costs of Simple Linear Regression","primary_cat":"cond-mat.stat-mech","submitted_at":"2026-05-18T23:51:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Thermodynamic lower bounds are approximated for exact and SGD linear regression, producing energy-aware scaling laws for optimal training dataset size given a target generalization error.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18474","ref_index":3,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Prompt2Fingerprint: Plug-and-Play LLM Fingerprinting via Text-to-Weight Generation","primary_cat":"cs.CR","submitted_at":"2026-05-18T14:30:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"P2F generates low-rank parameter increments for LLM fingerprinting directly from textual descriptions in a single forward pass.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17160","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"When Bits Break Recourse: Counterfactual-Faithful Quantization","primary_cat":"cs.LG","submitted_at":"2026-05-16T21:19:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CFQ trains quantizer parameters and mixed-precision allocation to preserve counterfactual recourse validity, cost, and direction on Adult, German Credit, and COMPAS while matching accuracy of standard quantizers.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15551","ref_index":297,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Characterizing Learning in Deep Neural Networks using Tractable Algorithmic Complexity Analysis","primary_cat":"cs.LG","submitted_at":"2026-05-15T02:44:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"QuBD extends algorithmic complexity estimation to quantized DNN weights, revealing that complexity decreases during learning, increases with overfitting, follows grokking patterns, and correlates with generalization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13699","ref_index":106,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Memristor Technologies for Dynamic Vision Sensors: A Critical Assessment and Research Roadmap","primary_cat":"cs.AR","submitted_at":"2026-05-13T15:51:05+00:00","verdict":"ACCEPT","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"A structured review concludes that end-to-end DVS-memristor integration for analog in-memory event-driven computing remains an open challenge at TRL 2-5, with half of surveyed applications resting on projections rather than demonstrations.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"on similar-complexity inference tasks, and the measured performance of digital neuromorphic processors (Loihi 2, SynSense Speck) that already process DVS event streams today. Within the DVS-adjacent literature, the HfO2-based128×641T1R crossbar of [48] achieves 99.8% device yield and 5-8-bit equivalent analog precision per cell, sufficient for convolutional inference under post-training quantization [106]; on-chip memristor training has been demon- strated in a fabricated 30-device prototype at 97% classification accuracy under realistic device variations [107]. These DVS-adjacent demonstrations remain orders of magnitude smaller than Rao 2023 or NeuRRAM in cell count, have not been tested on event-camera data, and require DAC conversion of the event stream."},{"citing_arxiv_id":"2605.12046","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Rethink the Role of Neural Decoders in Quantum Error Correction","primary_cat":"quant-ph","submitted_at":"2026-05-12T12:26:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Neural decoders for surface-code QEC achieve practical microsecond FPGA latency when trained on large datasets with appropriate inductive biases and INT4 quantization, rather than relying on architectural complexity.","context_count":1,"top_context_role":"baseline","top_context_polarity":"baseline","context_text":"our MWPM baseline represents a strong classical reference that leverages the full noise model information available from the quantum circuit. 23 Rethink the Role of Neural Decoders in Quantum Error Correction Evaluation metric.We report the logical error rate (LER), defined as the fraction of decoding trials in which the decoder incorrectly predicts the logical observable flip: LER = 1 N NX i=1 1[ˆyi ̸=y i],(12) where N is the number of test samples, yi ∈ {0,1} is the ground-truth logical flip for sample i, and ˆyi is the decoder's prediction. Lower LER indicates better decoding performance. D.2. Quantization and Pruning Table 12 summarizes the training hyperparameters for quantization and pruning experiments. Quantization experiments are conducted on 3D-CNN, TCN, and Transformer; MLP and GNN do not have quantized implementations."},{"citing_arxiv_id":"2605.10886","ref_index":34,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"LoKA: Low-precision Kernel Applications for Recommendation Models At Scale","primary_cat":"cs.LG","submitted_at":"2026-05-11T17:32:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LoKA enables practical FP8 use in numerically sensitive large recommendation models via online profiling of activations, reusable model modifications for stability, and dynamic kernel dispatching.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Per-update (single minibatch) estimates: SolveL V LT V =V+εI, fW=W c L−T V , U ′ = 1 N fWfW T ,(M×M) SolveL U LT U =U+εI, cW=L −1 U Wc, V ′ = 1 M cW TcW ,(N×N) EMA smoothing: U ′′ =m U+ (1−m)U ′, V ′′ =m V+ (1−m)V ′, U← 1 2(U ′′ +U ′′T ) +εI, V← 1 2(V ′′ +V ′′T ) +εI. Scale identifiability.Because V⊗U is invariant under (U, V)7→(cU, V /c) (where ⊗ is the Kronecker product [34], we renormalize to prevent drift using s for better numerical stability: s= trace(U) M , U← U s , V←s V. These updates allow us to avoid forming V −1 or U −1 explicitly. In practice we use small ε (e.g., 10−6 × trace(U) M ) and a momentumm∈[0.9,0.99]for stable online tracking. 5 To minimize overhead, LoKA Probe activates every 100 training iterations and asynchronously saves statistical parame-"},{"citing_arxiv_id":"2605.06485","ref_index":15,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Litespark Inference For CPUs: Ultra-Fast SIMD Framework for Ternary (1.58-bit) Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-07T16:07:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Litespark-Inference delivers custom SIMD kernels for ternary LLMs achieving up to 95.81x throughput versus PyTorch on CPUs by using integer addition/subtraction instead of floating-point math.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.10841","ref_index":24,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Harnessing Photonics for Machine Intelligence","primary_cat":"physics.optics","submitted_at":"2026-04-12T22:23:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Photonic computing can reshape AI acceleration through optical bandwidth and parallelism, but requires cross-layer co-design and electronic-photonic design automation to move from prototypes to scalable systems.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"This pivot harnesses the intrinsic properties of light, such as high bandwidth, low latency, and massive parallelism, to perform efficient linear transformations, a capability that aligns precisely with the workload of modern deep learning. Since foundation models are dominated by dense Matrix-Vector Multiplication (MVM) yet exhibit remarkable tolerance to low-precision computations [ 24, 25], they are ideally suited to the analog domain. This algorithmic robustness allows optical cores to serve ashigh-throughput, specialized primitives for the next generation of machine intelligence. Driven by this promise, recent years have witnessed rapid progress in optical computing prototypes, particularly photonic integrated circuit (PIC)-based optical neural networks (ONNs)."},{"citing_arxiv_id":"2604.08118","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Initialisation Determines the Basin: Efficient Codebook Optimisation for Extreme LLM Quantization","primary_cat":"cs.CL","submitted_at":"2026-04-09T11:38:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Output-aware EM initialization for codebooks in additive quantization avoids poor optimization basins and yields better 2-bit compressed LLMs across Llama and Qwen models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.26603","ref_index":59,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Sustainability Is Not Linear: Quantifying Performance, Energy, and Privacy Trade-offs in On-Device Intelligence","primary_cat":"cs.SE","submitted_at":"2026-03-27T17:00:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Empirical case study on a flagship Android device profiles energy, latency, and quality trade-offs across eight LLMs, revealing a quantization energy paradox and identifying mid-sized models as practical sweet spots.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2511.06516","ref_index":20,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"You Had One Job: Per-Task Quantization Using LLMs' Hidden Representations","primary_cat":"cs.CL","submitted_at":"2025-11-09T19:58:24+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.03472","ref_index":18,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"DPQuant: Efficient and Differentially-Private Model Training via Dynamic Quantization Scheduling","primary_cat":"cs.LG","submitted_at":"2025-09-03T16:51:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DPQuant uses epoch-wise probabilistic layer rotation and DP loss sensitivity to quantize only a changing subset of layers, reducing accuracy degradation from quantization noise in DP-SGD and delivering up to 2.21x throughput gains with under 2% accuracy drop.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2507.18553","ref_index":6,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"The Geometry of LLM Quantization: GPTQ as Babai's Nearest Plane Algorithm","primary_cat":"cs.LG","submitted_at":"2025-07-24T16:22:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"GPTQ is equivalent to Babai's nearest plane algorithm for CVP on the Hessian lattice of layer inputs, yielding geometric interpretation, inherited error bounds, and improved clipping-free quantization with GPU kernels.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2306.00978","ref_index":16,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration","primary_cat":"cs.CL","submitted_at":"2023-06-01T17:59:10+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AWQ quantizes LLM weights to low bits by scaling salient channels based on activation statistics, outperforming prior methods on language, coding, math, and multi-modal benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2210.17323","ref_index":6,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"GPTQ: Accurate Post-Training Quantization for Generative Pre-trained Transformers","primary_cat":"cs.LG","submitted_at":"2022-10-31T13:42:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"GPTQ quantizes 175B-parameter GPT models to 3-4 bits per weight in one shot using approximate second-order information, achieving negligible accuracy degradation and 3-4x inference speedups.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2208.07339","ref_index":135,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"LLM.int8(): 8-bit Matrix Multiplication for Transformers at Scale","primary_cat":"cs.LG","submitted_at":"2022-08-15T17:08:50+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LLM.int8() performs 8-bit inference for transformers up to 175B parameters with no accuracy loss by combining vector-wise quantization for most features with 16-bit mixed-precision handling of systematic outlier dimensions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}