{"total":60,"items":[{"citing_arxiv_id":"2605.22606","ref_index":5,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Missing Links in Public Email and Covert Networks: A Comparative Evaluation of Link Prediction, Hyperlink Prediction, and ERGM Estimation","primary_cat":"cs.SI","submitted_at":"2026-05-21T15:19:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Comparative evaluation shows link prediction strong for dyadic recovery while hyperlink prediction with CHESHIRE gains on higher-order structures under a shared masking protocol.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19323","ref_index":12,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Correlation between baryonic process and galaxy assembly bias","primary_cat":"astro-ph.GA","submitted_at":"2026-05-19T04:02:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Simulations show gas cooling and stellar feedback dominate assembly bias for stellar-mass selected galaxies while star formation gives way to gas cooling for SFR-selected galaxies as number density rises.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19100","ref_index":4,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"ldmppr: Location Dependent Marked Point Processes in R","primary_cat":"stat.CO","submitted_at":"2026-05-18T20:40:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"ldmppr is an R package providing tools to model, simulate from, and assess goodness-of-fit for location-dependent marked point processes.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18206","ref_index":132,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"A tool to determine the degrees of freedom in tree-structured varying coefficient models","primary_cat":"stat.ME","submitted_at":"2026-05-18T10:45:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A formula approximating degrees of freedom for tree-structured varying coefficient models is proposed to improve BIC model selection over naive parameter counting.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18066","ref_index":8,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"TIDAL: Recovering Temporal Phase for Cloud Block Storage Placement from LLM-Derived Semantics","primary_cat":"cs.OS","submitted_at":"2026-05-18T08:49:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"TIDAL recovers temporal phase signals from LLM-derived semantics of provisioning metadata to enable complementary CVD placement, reducing overload frequency by 79.1% on production traces.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17397","ref_index":55,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Exploring the Transitional Parameter Space of Blazars using Gamma-ray and X-ray Population Diagnostics","primary_cat":"astro-ph.HE","submitted_at":"2026-05-17T11:30:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Changing-look blazars occupy intermediate regions in gamma-ray and X-ray parameter spaces but lie statistically closer to flat-spectrum radio quasars than to BL Lac objects according to centroids, PCA, UMAP, and random-forest classification.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20234","ref_index":42,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"TabPFN-MT: A Natively Multitask In-Context Learner for Tabular Data","primary_cat":"cs.LG","submitted_at":"2026-05-16T15:02:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"TabPFN-MT is a multitask in-context learner for tabular data that sets a new state-of-the-art on deep multitask learning for datasets under 1000 samples while reducing inference cost from O(T) to O(1) passes.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16145","ref_index":21,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Skew-adaptive conformal prediction","primary_cat":"stat.ML","submitted_at":"2026-05-15T16:26:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Develops a skew-adaptive split conformal prediction method that learns local skewness via a gauge-derived conformity score and an asinh residual model while preserving marginal validity under exchangeability.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15874","ref_index":13,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Ti-iLSTM: A TinyDL Approach for Logic-Level Anomaly Detection in Industrial Water Treatment Systems","primary_cat":"cs.LG","submitted_at":"2026-05-15T11:44:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Ti-iLSTM optimizes LSTM for TinyDL to detect logic-layer deception anomalies in PLC-based IWTS, reporting F1=0.983 and AUC=0.998 on SWaT with validation on WADI.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15524","ref_index":65,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Neural Point-Forms","primary_cat":"cs.LG","submitted_at":"2026-05-15T01:44:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Neural point-forms are introduced as permutation-invariant neural layers that output learned form-comparison matrices for point clouds, with a claimed consistency proof under sampling and manifold assumptions and competitive results on synthetic and biological data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15432","ref_index":4,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"The Nova Synthetic Data Base: A Principal Component/AI Analysis of Novae Synoptic Spectra","primary_cat":"astro-ph.SR","submitted_at":"2026-05-14T21:25:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Presents the first public synthetic spectra database for novae and demonstrates a PCA/AI framework for retrieving physical properties from limited spectral data as a proof of concept for future surveys.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15373","ref_index":5,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Nonparametric inference for sublevel-set probabilities of conditional average treatment effect functions","primary_cat":"stat.ME","submitted_at":"2026-05-14T20:01:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Develops Grenander-type and debiased machine learning estimators for the sublevel-set probability curve of the CATE function, shown to be non-pathwise differentiable, along with its piecewise linear approximation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15354","ref_index":35,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Controllable Molecular Generative Foundation Models","primary_cat":"cs.LG","submitted_at":"2026-05-14T19:27:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"CoMole uses a motif-aware graph diffusion pipeline with RL to rank first in controllability on nine targets across materials and drug benchmarks while keeping validity above 0.94 without post-processing.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14318","ref_index":28,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Semantic Feature Segmentation for Interpretable Predictive Maintenance in Complex Systems","primary_cat":"cs.AI","submitted_at":"2026-05-14T03:29:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Semantic segmentation decomposes monitoring features into canonical and residual components that concentrate fault-predictive information while preserving operational meaning in predictive maintenance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11247","ref_index":29,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"A Proof-of-Concept Simulation-Driven Digital Twin Framework for Decision-Aware Diabetes Modeling","primary_cat":"cs.LG","submitted_at":"2026-05-11T21:10:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"A simulation-driven digital twin framework is shown to generate interpretable diabetes trajectories for decision-aware analysis by combining benchmark data with controlled synthetic scenarios.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"3:Mapping between input features and predictive models, including both classical machine learning methods and neural network-based approaches within the digital twin framework. Gradient boosting and tree-based ensemble methods are widely used for structured data, as they effectively capture nonlinear feature interactions while maintaining strong per- formance on moderate-sized datasets [28], [29]. For neural network models, the prediction can be expressed as ˆy=W2σ(W1x+b 1) +b 2, whereW 1 andW 2 are weight matrices,b 1 andb 2 are bias terms, andσ(·)is a nonlinear activation function. These formulations provide concrete and reproducible in- stantiations of the predictive component within the digital twin framework. Neural network-based extensions can further in-"},{"citing_arxiv_id":"2605.10722","ref_index":17,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"On Improving Graph Neural Networks for QSAR by Pre-training on Extended-Connectivity Fingerprints","primary_cat":"cs.LG","submitted_at":"2026-05-11T15:30:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Pre-training GNNs on ECFP prediction produces statistically significant QSAR gains on five of six Biogen benchmarks with OOD splits, but underperforms on heterogeneous datasets and complex endpoints like binding affinity.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"From intuition to AI: Evolution of small molecule represen- tations in drug discovery.Briefings in Bioinformatics25, bbad422 (2024). URL https://doi.org/10.1093/bib/bbad422. [16] Dablander, M., Hanser, T., Lambiotte, R. & Morris, G. M. Exploring QSAR models for activity-cliff prediction.Journal of Cheminformatics15, 47 (2023). URL https://doi.org/10.1186/s13321-023-00708-w. [17] Jiang, D.et al.Could graph neural networks learn better molecular representa- tion for drug discovery? A comparison study of descriptor-based and graph-based models.Journal of Cheminformatics13, 12 (2021). URL https://doi.org/10. 1186/s13321-020-00479-8. [18] Xia, J.et al. Understanding the Limitations of Deep Models for Molecular prop- erty prediction: Insights and Solutions."},{"citing_arxiv_id":"2605.10616","ref_index":9,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"MulTaBench: Benchmarking Multimodal Tabular Learning with Text and Image","primary_cat":"cs.LG","submitted_at":"2026-05-11T14:12:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MulTaBench is a new collection of 40 image-tabular and text-tabular datasets designed to test target-aware representation tuning in multimodal tabular models.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"which contextualize the representations of unstructured modalities can push the boundaries of MMTL, and we believe that MulTaBench would be instrumental for developing true Multimodal TFMs. 2 Related Work Tabular Foundation Models.The landscape of tabular learning shifted with Prior-data Fitted Networks (PFNs) [69], which pretrain transformers over synthetic tabular datasets with in-context learning (ICL) [9]. The TabPFN family [40, 41, 34, 27] pioneered this direction. Multiple subsequent works [75, 76, 62, 103, 86, 102, 6] advanced the paradigm with improvements spanning synthetic data diversity, real-world data pretraining, and architectural scalability. Among these, ConTextTab [86] is the only PFN to incorporate textual fields, yet it does not process raw strings; instead, it"},{"citing_arxiv_id":"2605.08448","ref_index":126,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"LLM-guided Semi-Supervised Approaches for Social Media Crisis Data Classification","primary_cat":"cs.AI","submitted_at":"2026-05-08T20:15:40+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LG-CoTrain, an LLM-guided co-training method, outperforms classical semi-supervised baselines for crisis tweet classification in low-resource settings with 5-25 labeled examples per class.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06343","ref_index":6,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Mind the Gap? A Distributional Comparison of Real and Synthetic Priors for Tabular Foundation Models","primary_cat":"cs.AI","submitted_at":"2026-05-07T14:29:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"The synthetic prior for tabular foundation models covers only a narrow part of real table distributions, but this mismatch does not degrade model generalization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08242","ref_index":26,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"An Explainable Unsupervised-to-Supervised Machine Learning Framework for Dietary Pattern Discovery Using UK National Dietary Survey Data","primary_cat":"q-bio.QM","submitted_at":"2026-05-07T09:05:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"An unsupervised-to-supervised ML pipeline on UK NDNS data discovers four dietary patterns, reproduces them with macro-F1 0.963 using a surrogate classifier, and interprets them via SHAP for potential clinical use.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"evaluates whether the cluster assignments can be reproduced on held-out participants from the dietary feature representation and provides a model to which SHAP can be applied. Consequently, classifier performance is interpreted as cluster reproducibility and explainability evidence, not as evidence of clinical or external predictive validity. Three classifiers were compared: Logistic Regression, Random Forest and XGBoost [26-27]. Performance was evaluated using accuracy, macro-F1 and weighted-F1, with macro-F1 treated as the primary metric because cluster sizes were not identical. SHAP analysis was applied to the best- performing classifier [21]. Global SHAP importance identified influential features overall, while class- specific SHAP summaries were used to interpret the dietary drivers of each pattern."},{"citing_arxiv_id":"2605.16326","ref_index":19,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"A Machine Learning Framework for EEG-Based Prediction of Treatment Efficacy in Chronic Neck Pain","primary_cat":"q-bio.QM","submitted_at":"2026-05-05T10:09:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"A preprocessing pipeline for resting-state and motor-task EEG is described to support future machine learning models that predict treatment efficacy in chronic neck pain.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.03266","ref_index":281,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Intrinsic effective sample size for manifold-valued Markov chain Monte Carlo via kernel discrepancy","primary_cat":"stat.ML","submitted_at":"2026-05-05T01:37:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"An intrinsic effective sample size for manifold MCMC is defined via kernel discrepancy as the number of independent draws yielding equivalent expected squared discrepancy to the target.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.01484","ref_index":119,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Evaluating LLMs on Large-Scale Graph Property Estimation via Random Walks","primary_cat":"cs.LG","submitted_at":"2026-05-02T15:11:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"EstGraph benchmark evaluates LLMs on estimating properties of very large graphs from random-walk samples that fit in context limits.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"due to the context length of LLMs. Instead, depend- ing on the task, we can summarize the statistics of the walks that can easily scale with the graph size. 4 Estimation of Graph Properties 4.1 Estimation of Number of Nodes and Edges Estimating the size of a graph and other proper- ties has been extensively studied in the graph lit- Graph description: [(149, 32), (145, 220), (126, 222), (15, 77), (190, 191), (223, 224), (18, 232), (137, 174), (18, 19), (247, 52), (157, 178), (11, 162), (160, 2), (174, 246), (114, 37), (120, 213), (132, 133), (5, 11), (3, 4), (142, 53), (24, 29), (105, 101), (5, 13), (112, 56), (31, 34), (165, 106), (32, 236), (220, 203), (230, 231), (143, 145), (17, 20), (35, 36), (158, 30), (14, 66), (89, 91), (156, 157), (95, 61), (135, 172), (112, 138), (161, 56), (123, 106), (147, 150), (145, 136), (198, 90), (18, 20), (202, 13), (33, 114), (39, 40), (143, 144), (82, 84), (5, 14), (169, 168), (119, 213), (240, 171), (121, 192), (239, 95), (126, 218), (44, 87), (197, 198), (240, 167), (43, 169), (18, 53), (89, 90), (0, 2), (98, 95), (207, 168), (5, 15), (114, 68), (47, 48), (92, 96), (74, 32), (216, 240), (67, 70), (46, 167), (156, 158), (132, 136), (55, 245), (155, 168), (133, 2), (32, 227) …+Task description Random Walks RW-1 description …RW-2 description …RW-3 description …RW-4 description …Task description+ [ ] [ ] Running out of context length LLM full graph description Graph description within context length Figure 2: Figure illustrates the issue of exceeding context length as the graph size increases. Random walks on graphs provide efficient way of extracting and encoding graph-related information. erature especially for social networks like Face- book or Twitter."},{"citing_arxiv_id":"2605.00618","ref_index":12,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Is Textual Similarity Invariant under Machine Translation? Evidence Based on the Political Manifesto Corpus","primary_cat":"cs.CL","submitted_at":"2026-05-01T12:41:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Machine translation preserves embedding similarity structure for ten languages but distorts it for four in the Manifesto Corpus, via a new non-inferiority testing framework.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"resid,L is the model-based estimate of the marginal standard deviation of a correlation in the reference class for the hypothesis at hand (original language model pairs for the baseline, best-model, and performance-equivalence hypotheses; mul- tilingual model pairs for the multilingual hypothesis), derived from a crossed multi-membership random-effects fit [12] that pools across all reference-class pairs in language L. We report our primary results for κ= 1 , which corresponds to the invariance margin being equal to the typical within-reference-class heterogeneity - so translation is deemed invariant when its induced perturbation is no larger than that arising from natural disagreement among reference-class model"},{"citing_arxiv_id":"2605.00538","ref_index":70,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Vesselpose: Vessel Graph Reconstruction from Learned Voxel-wise Direction Vectors in 3D Vascular Images","primary_cat":"cs.CV","submitted_at":"2026-05-01T09:34:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Vesselpose predicts voxel-wise direction vectors to extend the TEASAR algorithm for topologically accurate vascular graph reconstruction from 3D images.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.00363","ref_index":271,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Profile Likelihood Inference for Anisotropic Hyperbolic Wrapped Normal Models on Hyperbolic Space","primary_cat":"math.ST","submitted_at":"2026-05-01T02:54:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"The profile maximum likelihood estimator for the location in anisotropic hyperbolic wrapped normal models is strongly consistent, asymptotically normal, and attains the Hájek-Le Cam minimax lower bound under squared geodesic loss.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.27775","ref_index":49,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Data-Efficient Indentation Size Effect Correction in Steels Using Machine Learning and Physics-Guided Augmentation","primary_cat":"cond-mat.mtrl-sci","submitted_at":"2026-04-30T12:12:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Physics-guided data augmentation combined with neural networks enables accurate indentation size effect correction in steels from small sets of shallow nanoindentation measurements, outperforming Nix-Gao in the shallow regime.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"minimization with early stopping (patience = 15 rounds) evaluated against the internal test partition [47]. (4) Two feed-forward Neural Network architectures were evaluated with ReLU activations and the Adam optimizer (up to 10,000 iterations) [48]. The unconstrained baseline (64-64) provides a standard non-linear mapping reference. The constrained architecture (64-8-64) introduces a narrow 8-neuron intermediate layer [49]. Because Pmax, Wtot, and hmax reflect the same loading history at different integration levels, constraining the representation to 8 latent dimensions is designed to force compression of collinear load-scaling information, retaining only features that distinguish intrinsic material strength from depth-dependent geometric artifacts. How the latent activations of this layer are analyzed by"},{"citing_arxiv_id":"2604.25304","ref_index":25,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"RCProb: Probabilistic Rule Extraction for Efficient Simplification of Tree Ensembles","primary_cat":"cs.LG","submitted_at":"2026-04-28T07:12:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RCProb uses Dirichlet-smoothed class priors and Beta-smoothed condition likelihoods in a Naive Bayes formulation to extract rules from tree ensembles approximately 22 times faster than RuleCOSI+ while maintaining competitive accuracy and producing more compact rule sets on 33 benchmark datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.25196","ref_index":5,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Knowledge-Data Dually Driven Paradigm for Accurate Landslide Susceptibility Prediction under Data-Scarce Conditions Using Geomorphic Priors and Tabular Foundation Model","primary_cat":"cs.LG","submitted_at":"2026-04-28T04:05:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A knowledge-data dual paradigm using geomorphic priors and a tabular foundation model achieves baseline-level landslide susceptibility prediction accuracy with only 30% of typical data in tested regions.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"distills the latent regularities inherent within the scarce data itself, effectively preventing overfitting. Within the proposed paradigm, the tabular foundation model receives conditioning factors incorporat- ing geomorphic prior knowledge, is fitted on labeled SU samples (landslide/non-landslide), and outputs landslide occurrence probabilitiesˆpj∈[0, 1] for all slope units: ˆpj =f T F M \u0010 x(S U) j |D (S U) train \u0011 (5) wherex (S U) j represents the comprehensive conditioning factors of the j-th slope unit (integrating the physics- based geomorphic prior knowledge alongside supplementary topographic and environmental factors), and D(S U) train denotes the available scarce training dataset. Through this profound integration of geomorphic prior knowledge and foundation model inference,"},{"citing_arxiv_id":"2604.24516","ref_index":7,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"StarCLR: Contrastive Learning Representation for Astronomical Light Curves","primary_cat":"astro-ph.SR","submitted_at":"2026-04-27T14:18:49+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"StarCLR pretrains on TESS light curves via contrastive learning on overlapping subsequences and improves variable star classification F1 scores over scratch-trained models when fine-tuned on TESS, ZTF, and Gaia.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"corresponds to the size of the key and query vectors, typically defined asd k =H/k, whereHis the hidden dimension of the input andkis the number of attention heads. The scaling factor √dk alleviates issues of vanishing or exploding gradients, thereby improving training stability. The outputs of thekattention heads are concatenated and linearly transformed to produce the final attention output: MultiHead(Q, K, V) = Concatenate(head 1,head 2,· · ·,head k)W O (7) whereW O is a learnable projection matrix for integrating the head outputs. This design enables the model to simultaneously capture information from multiple feature subspaces, thereby strengthening its capability for sequence modeling and representation learning. 3.4.Pretrain loss The contrastive learning strategy adopted in this work follows Yue et al."},{"citing_arxiv_id":"2604.23824","ref_index":9,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Resource-Lean Lexicon Induction for German Dialects","primary_cat":"cs.CL","submitted_at":"2026-04-26T18:09:56+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Random forests on string similarity features outperform LLMs for German dialect lexicon induction and boost dialect information retrieval by up to 50% in recall.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.22328","ref_index":26,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"FETS Benchmark: Foundation Models Outperform Dataset-specific Machine Learning in Energy Time Series Forecasting","primary_cat":"cs.LG","submitted_at":"2026-04-24T08:00:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Foundation models outperform dataset-specific machine learning in energy time series forecasting across 54 datasets in 9 categories.","context_count":1,"top_context_role":"baseline","top_context_polarity":"baseline","context_text":"engineering often outweighs architectural complexity, with gradient boosting and quantile regression emerging as strong probabilistic baselines. In line with these findings, Gradient Boosted Trees like XGBoost dominate recent energy forecasting competitions [25, 4, 5], while random forest remains a robust, stable, and simple baseline widely adopted in practical applications due to its minimal tuning requirements and interpretability [26]. Large-scale empirical comparisons further report that such compact, task-specifically tuned tree ensembles remain highly competitive with task-specific deep learning architectures, as the limited historical data typical of individual energy forecasting tasks favor small, low-variance models with strong inductive biases and hand-crafted features over larger,"},{"citing_arxiv_id":"2604.22084","ref_index":23,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Generating Synthetic Malware Samples Using Generative AI","primary_cat":"cs.LG","submitted_at":"2026-04-23T21:33:05+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Opcode-sequence generative models produce synthetic malware data that raises minor-class classification accuracy by up to 60% and overall detection to 96%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.21042","ref_index":2,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Interpretable Quantile Regression by Optimal Decision Trees","primary_cat":"cs.LG","submitted_at":"2026-04-22T19:40:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A novel algorithm learns sets of optimal quantile regression trees to predict full conditional distributions interpretably and efficiently.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.19690","ref_index":3,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Is the `Known' Enough? An Integrated Machine Learning Framework for Eclipsing Binary Classification and Parameter Estimation Based on Well-Characterized Systems","primary_cat":"astro-ph.SR","submitted_at":"2026-04-21T17:12:27+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"An ensemble ML framework achieves 90.7% morphology classification accuracy and R² values of 0.77–0.92 for key parameters on held-out test data, with external validation against OGLE and Kepler catalogs.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"ensure a consistent input format for feature extraction, any photometric data provided in magnitudes were converted to flux units prior to processing. For training data quality and physical consistency, systems were excluded from the initial compilation if they met any of the following criteria: (1) large gaps in orbital phase coverage preventing reliable eclipse characterization; (2) extreme photometric scatter indicating poor data quality; (3) light curve morphology inconsistent with eclipsing binary signatures; or (4) absence of both primary and secondary minima. Furthermore, every light curve in the training set underwent a visual inspection to guarantee the absence of artifacts or non-physical outliers that could compromise the normalization process. For the training dataset, comprised of high-quality, visually verified light curves, we employed"},{"citing_arxiv_id":"2604.18910","ref_index":1,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Predicting Redshift in Seyfert Galaxies Using Machine Learning","primary_cat":"astro-ph.GA","submitted_at":"2026-04-20T23:27:10+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Random Forest regression on combined optical plus mid-infrared colors yields NMAD of 0.0188, R-squared of 0.9561, and 0.294 percent outliers for photometric redshifts in 23,797 Seyfert II galaxies selected from SDSS and WISE.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"We restrict the sample to spectroscopically observed objects withclass = GALAXYandsubClass = AGN, thereby adopting the internal SDSS classification scheme to identify galaxies whose emission-line ratios are consistent with AGN activity. This clas- sification is assigned by the SDSS spectroscopic pipeline based on the criterion of Bolton et al. (2012): log \u0012 [OIII] Hβ \u0013 >0.7−1.2×log \u0012 [NII] Hα \u0013 −0.4,(1) which separates AGN-dominated systems from star-forming galaxies. The resulting parent sample consists of 23,797 objects. This choice is motivated by the need for a uniform and re- producible sample definition. Automated classifications ensure consistency across the dataset and reduce selection biases intro- duced by heterogeneous line-ratio criteria."},{"citing_arxiv_id":"2604.18579","ref_index":10,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"The T16 Planet Hunt: 10,000 New Planet Candidates from TESS Cycle 1 and the Confirmation of a Hot Jupiter Around TIC 183374187","primary_cat":"astro-ph.EP","submitted_at":"2026-04-20T17:59:06+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A transit search on TESS Cycle 1 full-frame images produced 10,091 new planet candidates down to T=16 mag, more than doubling the known TESS total, with one hot Jupiter confirmed by radial velocity.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"Plugging in our best fit values we get a tidal circulariza- tion timescale oft e ∼0.5 Gyrs. The preference for slight eccentricity is marginal and could thus be an artifact of our fitting procedure. Equation 13 in Hara et al. (2019) allows the quantification of this so-called Lucy-Sweeney bias (Lucy & Sweeney 1971) if the true eccentricity is assumed to be near zero: b= r π 4−π σe (10) whereσ e is the measured uncertainty on the eccentric- ity. We measured a uncertainty ofσ e = 0.15, resulting in a bias ofb≈0.287. This result indicates that our fit is entirely compatible with a circular orbit. In ad- dition, the presence of potential additional planets in the system could drive eccentricity. If real, however, the 17 Figure 9.Period distribution of our candidates."},{"citing_arxiv_id":"2604.18083","ref_index":13,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Implicit neural representations as a coordinate-based framework for continuous environmental field reconstruction from sparse ecological observations","primary_cat":"cs.LG","submitted_at":"2026-04-20T10:59:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Implicit neural representations enable stable, resolution-independent reconstruction of continuous environmental fields from sparse and irregular ecological data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17622","ref_index":7,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"STRIKE: Additive Feature-Group-Aware Stacking Framework for Credit Default Prediction","primary_cat":"cs.LG","submitted_at":"2026-04-19T21:21:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"STRIKE improves credit default prediction AUC-ROC by training independent models on feature groups and aggregating their outputs via a meta-learner, outperforming tree baselines and conventional stacking on three real datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.13392","ref_index":8,"ref_count":2,"confidence":0.88,"is_internal_anchor":false,"paper_title":"ReSS: Learning Reasoning Models for Tabular Data Prediction via Symbolic Scaffold","primary_cat":"cs.AI","submitted_at":"2026-04-15T01:43:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ReSS extracts decision paths from trees as scaffolds to guide LLM reasoning generation, fine-tunes the LLM on the resulting dataset with scaffold-invariant augmentation, and reports up to 10% gains on medical and financial tabular benchmarks with new faithfulness metrics.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"of progress in tabular learning (Breiman et al., 1984; Chen et al., 2016; Si et al., 2024), constructing models that jointly achieve strong performance and meaningful reasoning re- mains a fundamental challenge. Traditional machine learning methods have long dominated tabular prediction, with tree-based approaches such as deci- sion tree (Breiman et al., 1984), random forests (Breiman, 2001), XGBoost (Chen et al., 2016), and LightGBM (Ke et al., 2017) achieving strong empirical performance. From a reasoning perspective, however, these models remain fun- damentally limited. Tree ensemble methods do not expose an explicit, instance-level decision process, while single decision trees express reasoning solely through symbolic rules defined over low-level feature thresholds."},{"citing_arxiv_id":"2604.12109","ref_index":7,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Identifying Changing-Look AGN Transitions in Light Curve Data with the Zwicky Transient Facility","primary_cat":"astro-ph.GA","submitted_at":"2026-04-13T22:32:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A criterion of |Δg| > 0.4 mag and |Δ(g-r)| > 0.2 mag detects photometric CL-AGN transitions in 9.6% of known hosts with 1.6% false positive rate from simulations.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"and on what timescales. 6.3.4.Propagation of a heating/cooling front N. P. Ross et al. (2018) proposed a scenario in which changing-look quasars are triggered by the propagation of a heating or cooling front through the accretion disk. The timescale for this to occur is given by tfront ∼20 yrs \u0012 h/R 0.05 \u0013−1\u0010 α 0.03 \u0011−1\u0012 MBH 108M⊙ \u0013\u0012 R 150rs \u00133/2 , (7) 16 where the viscosity and scale height of the disk mod- erate the speed at which the front can propagate (D. Stern et al. 2018, equation 7). N. P. Ross et al. (2018) proposed that changes at the innermost stable circular orbit, or ISCO (wherer ISCO ≡6r s) could deflate the in- ner disk, sending a cooling front outward. At later times a heating front would propagate inward, re-inflating the"},{"citing_arxiv_id":"2604.10293","ref_index":17,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Impact of Validation Strategy on Machine Learning Performance in EEG-Based Alcoholism Classification","primary_cat":"eess.SP","submitted_at":"2026-04-11T17:17:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Nested cross-validation reveals optimistic bias in standard validation for EEG alcoholism classification, with AdaBoost reaching 78.3% accuracy and most model differences not statistically significant per McNemar's test.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"improve EEG classification robustness across subjects and recording conditions [5], [12]-[15]. Among ensemble meth- ods, AdaBoost [16] has demonstrated strong performance in biomedical classification tasks by adaptively focusing on hard- to-classify samples through iterative reweighting, making it particularly suitable for imbalanced or noisy data. Random Forest [17] employs bootstrap aggregation (bagging) of decision trees, where each tree is trained on a random subset of features and data samples. This ensemble approach reduces variance arXiv:2604.10293v1 [eess.SP] 11 Apr 2026 2 and improves stability compared to individual decision trees, making it robust to overfitting in high-dimensional feature spaces."},{"citing_arxiv_id":"2604.08021","ref_index":28,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"SynQL: A Controllable and Scalable Rule-Based Framework for SQL Workload Synthesis for Performance Benchmarking","primary_cat":"cs.DB","submitted_at":"2026-04-09T09:20:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SynQL synthesizes diverse, execution-ready SQL workloads by deterministically traversing foreign-key graphs to populate ASTs, yielding high topological entropy and cost-model training data with R² ≥ 0.79 on held-out sets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.05225","ref_index":4,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"fastml: Guarded Resampling Workflows for Safer Automated Machine Learning in R","primary_cat":"stat.CO","submitted_at":"2026-04-06T22:41:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"fastml is an R package that enforces leakage-free preprocessing through guarded resampling and provides a unified interface for safer automated ML including survival analysis.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"This parameterization is used to ensure positivity and stable estimation while still yielding hazard rates that correspond to piecewise-constant intervals. The cumulative hazard at time 𝑡 is computed as the sum of hazards over all completed intervals plus the partial contribution of the interval containing 𝑡. The survival function is then 𝑆(𝑡) = exp(−𝐻(𝑡)), (4) and the density follows directly from the hazard and survival functions. These expressions cor- respond to the standard piecewise constant hazard formulation, while the implementation itself evaluates them procedurally. Model fitting is performed via flexsurv::flexsurvreg(), with fastml passing the custom dis- tribution definition and the normalized cutpoints through its internal fitting utilities."},{"citing_arxiv_id":"2604.16378","ref_index":8,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Reciprocal Co-Training (RCT): Coupling Gradient-Based and Non-Differentiable Models via Reinforcement Learning","primary_cat":"cs.CL","submitted_at":"2026-03-24T20:21:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"RCT couples an LLM and Random Forest via RL feedback so each augments the other's features and rewards, producing consistent gains on three medical datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.03274","ref_index":21,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Financial Dynamics and Interconnected Risk of Liquid Restaking","primary_cat":"q-fin.GN","submitted_at":"2026-03-23T10:58:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Renzo liquid restaking revenue is primarily predicted by EigenLayer value locked, token yield, and multi-blockchain expansion, with current bridge risks not imposing systemic threats to the restaking ecosystem.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.04925","ref_index":4,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Detecting RAG Advertisements Across Advertising Styles","primary_cat":"cs.IR","submitted_at":"2026-03-05T08:16:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Entity recognition models detect ads in RAG responses effectively and stay robust when advertisers switch styles, while lightweight models like random forests and SVMs become brittle under the same changes.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.21876","ref_index":28,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Comparative Evaluation of Machine Learning Models for Predicting Donor Kidney Discard","primary_cat":"stat.AP","submitted_at":"2026-02-25T13:00:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"On 4080 German deceased donors, an ensemble ML model reached MCC 0.76 for kidney discard prediction, with standardized preprocessing and feature selection proving more important than the specific algorithm chosen.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.01548","ref_index":8,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Photometric Redshift PDFs via Neural Network Classification for DESI Legacy Imaging Surveys and Pan-STARRS","primary_cat":"astro-ph.GA","submitted_at":"2026-02-02T02:28:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Neural network classification with CRPS optimization produces calibrated photometric redshift PDFs for DESI Legacy and Pan-STARRS data, achieving σ_NMAD of 0.0153 on LSDR10 and outperforming regression methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2601.01119","ref_index":63,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Community-Based Early-Stage Chronic Kidney Disease Screening using Explainable Machine Learning for Low-Resource Settings","primary_cat":"cs.LG","submitted_at":"2026-01-03T08:43:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Machine learning models trained on Bangladeshi community data achieve 89-90% balanced accuracy for early CKD detection using few accessible features, outperforming traditional screening tools and generalizing across external datasets from India, UAE, and Bangladesh.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}