{"total":25,"items":[{"citing_arxiv_id":"2605.30728","ref_index":82,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Reducing the GPU Memory Bottleneck with Lossless Compression for ML -- Extended","primary_cat":"cs.LG","submitted_at":"2026-05-29T01:45:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"IBP is a new lossless bit-packing algorithm with GPU-optimized decompression that speeds up GNN training by 74%, DLRM lookups by 180%, and LLM inference by 24% by reducing CPU-GPU data movement.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30195","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"What drives performance in molecular MPNNs? An operator-level factorial benchmark","primary_cat":"cond-mat.mtrl-sci","submitted_at":"2026-05-28T16:34:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Operator-level factorial benchmark of 84 MPNN configurations finds message-seed initialization and node-edge fusion drive performance on MoleculeNet tasks more than node updates.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20248","ref_index":47,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Graph Transductive Sharpening: Leveraging Unlabeled Predictions in Node Classification","primary_cat":"cs.LG","submitted_at":"2026-05-18T06:47:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Transductive Sharpening adds an entropy-minimization term on unlabeled-node predictions to the training objective for graph node classification.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.00670","ref_index":41,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Robust Multimodal Recommendation via Graph Retrieval-Enhanced Modality Completion","primary_cat":"cs.IR","submitted_at":"2026-05-01T13:50:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GRE-MC retrieves relevant subgraphs and uses a graph transformer plus sparse codebook to complete missing modalities, outperforming prior methods on recommendation benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.23139","ref_index":31,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"GreenDyGNN: Runtime-Adaptive Energy-Efficient Communication for Distributed GNN Training","primary_cat":"cs.DC","submitted_at":"2026-04-25T04:43:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"GreenDyGNN applies Double-DQN to adapt cache management in distributed GNN training, cutting energy by up to 43% under congestion versus static policies.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.19004","ref_index":37,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Ocean: Fast Estimation-Based Sparse General Matrix-Matrix Multiplication on GPU","primary_cat":"cs.DC","submitted_at":"2026-04-21T02:46:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Ocean uses HyperLogLog estimators to skip the costly symbolic phase of GPU SpGEMM, pairs it with dynamic workflow choice and a shared-plus-global hash accumulator, and reports 1.4-2.8x speedups over prior GPU implementations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18939","ref_index":128,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"TabEmb: Joint Semantic-Structure Embedding for Table Annotation","primary_cat":"cs.LG","submitted_at":"2026-04-21T00:25:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"TabEmb decouples LLM-based semantic column embeddings from graph-based structural modeling to produce joint representations that improve table annotation tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18913","ref_index":24,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"LogosKG: Hardware-Optimized Scalable and Interpretable Knowledge Graph Retrieval","primary_cat":"cs.CL","submitted_at":"2026-04-20T23:32:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LogosKG delivers a novel hardware-aligned system for efficient multi-hop retrieval on billion-edge knowledge graphs without sacrificing fidelity, demonstrated via biomedical KG-LLM applications.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17834","ref_index":7,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"AsyncSparse: Accelerating Sparse Matrix-Matrix Multiplication on Asynchronous GPU Architectures","primary_cat":"cs.DC","submitted_at":"2026-04-20T05:44:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"AsyncSparse presents BCSR and WCSR kernels that use TMA and warp specialization to accelerate SpMM, outperforming prior libraries by 1.47-6.24x on SuiteSparse and achieving 2.66x end-to-end speedup on Qwen2.5-7B at 90% block sparsity.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"5-7B prefill at 90% block sparsity with 64K tokens over cuDNN/cuBLAS. Index Terms-structured sparsity, SpMM, sparse linear alge- bra, kernel optimization, performance evaluation I. INTRODUCTION Sparse Matrix-Matrix Multiplication (SpMM) is a performance-critical primitive across scientific computing [1], [2], graph analytics [3], [4], and deep learning inference [5]- [7]. As problem sizes and model scales grow, SpMM performance on GPU accelerators increasingly dominates end-to-end runtime and cost. To harness the massive computate power of contemporary GPUs, applications frequently enforce block sparsity, where nonzero elements in the sparse operand are clustered into dense sub-blocks [8]-[10]. In deep learning, structured pruning techniques [11], [12] drive this trend,"},{"citing_arxiv_id":"2604.16715","ref_index":28,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Scalable and Adaptive Parallel Training of Graph Transformer on Large Graphs","primary_cat":"cs.DC","submitted_at":"2026-04-17T21:29:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A new distributed framework for graph transformer training auto-selects parallel strategies and optimizes sparse operations to deliver up to 6x speedup on 8 GPUs and 78% memory reduction.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.15833","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Modern Structure-Aware Simplicial Spatiotemporal Neural Network","primary_cat":"cs.LG","submitted_at":"2026-04-17T08:35:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ModernSASST is the first simplicial complex-based spatiotemporal model that combines random walks on high-dimensional complexes with parallelizable temporal convolutional networks for efficient high-order topology capture.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.07492","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Cluster Attention for Graph Machine Learning","primary_cat":"cs.LG","submitted_at":"2026-04-08T18:33:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Cluster attention uses off-the-shelf community detection to define attention scopes within graph clusters, augmenting MPNNs and Graph Transformers to achieve larger receptive fields with preserved structural inductive biases and improved performance on diverse graph datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.02651","ref_index":44,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Communication-free Sampling and 4D Hybrid Parallelism for Scalable Mini-batch GNN Training","primary_cat":"cs.LG","submitted_at":"2026-04-03T02:30:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ScaleGNN uses communication-free sampling and 4D parallelism to scale mini-batch GNN training to 2048 GPUs, achieving 3.5x speedup over prior state-of-the-art on ogbn-products.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"that uses 3D parallelism to distribute the workload across GPUs. GNNPipe [42] and Mithril [43] explore pipelined layer-level model parallelism, partitioning GNN layers across GPUs to reduce communication volume. These systems handle large-scale graphs effectively, but full-graph training remains expensive on very large graphs. Mini-batch systems.DistDGL [17] extends the Deep Graph Library [44] to multiple machines. It partitions the graph and stores vertex features in a distributed key-value store. Each worker runs sampling and training locally but must fetch remote features through the network, which can become a bottleneck at scale. MassiveGNN [19] builds on DistDGL with optimized feature fetching and supports training on graphs with billions of edges."},{"citing_arxiv_id":"2602.22822","ref_index":52,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"FlexMS is a flexible framework for benchmarking deep learning-based mass spectrum prediction tools in metabolomics","primary_cat":"cs.AI","submitted_at":"2026-02-26T10:05:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"FlexMS is a new flexible benchmarking framework that lets researchers dynamically combine deep learning architectures and evaluate their mass spectrum prediction performance on public metabolomics datasets using multiple metrics and retrieval tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2512.20178","ref_index":9,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"SHIRO: Near-Optimal Communication Strategies for Distributed Sparse Matrix Multiplication","primary_cat":"cs.DC","submitted_at":"2025-12-23T09:16:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SHIRO achieves geometric mean speedups of 221.5x to 8.8x over four baselines in distributed SpMM on up to 128 GPUs by exploiting sparsity patterns and two-tier network topologies.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2512.12642","ref_index":38,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Torch Geometric Pool: the PyTorch library for pooling in Graph Neural Networks","primary_cat":"cs.LG","submitted_at":"2025-12-14T11:15:09+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A new open-source library standardizes 20 hierarchical graph pooling operations under one SRCL interface with uniform outputs and batch handling for PyTorch Geometric.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2511.17113","ref_index":54,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"AutoGraphAD: Unsupervised network anomaly detection using Variational Graph Autoencoders","primary_cat":"cs.CR","submitted_at":"2025-11-21T10:22:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"AutoGraphAD applies a heterogeneous variational graph autoencoder with unsupervised and contrastive learning to detect network anomalies on connection-IP graphs without labeled data, achieving comparable performance to Anomal-E with over an order of magnitude faster training and inference.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2510.01801","ref_index":44,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Detecting LLM-Generated Spam Reviews by Integrating Language Model Embeddings and Graph Neural Network","primary_cat":"cs.CL","submitted_at":"2025-10-02T08:42:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Introduces FraudSquad, a hybrid model using language model embeddings and a gated graph transformer that outperforms baselines on newly created LLM-generated spam review datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.24276","ref_index":43,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"G-reasoner: Foundation Models for Unified Reasoning over Graph-structured Knowledge","primary_cat":"cs.AI","submitted_at":"2025-09-29T04:38:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"G-reasoner uses QuadGraph abstraction and a 34M-parameter graph foundation model integrated with LLMs to enable scalable reasoning over diverse graph-structured knowledge, outperforming baselines on six benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2505.04018","ref_index":39,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Modal Decomposition and Identification for a Population of Structures Using Physics-Informed Graph Neural Networks and Transformers","primary_cat":"cs.CE","submitted_at":"2025-05-06T23:31:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A physics-informed GNN-transformer model performs unsupervised modal decomposition and identification for populations of structures from sparse dynamic measurements.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2412.10665","ref_index":35,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Pretrained Event Classification Model for High Energy Physics Analysis","primary_cat":"hep-ph","submitted_at":"2024-12-14T03:45:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A GNN pretrained on 120M simulated HEP events generalizes to unseen processes and ATLAS data; fine-tuning boosts accuracy especially with small datasets, with CKA showing preserved encoders but altered intermediate layers.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2410.01308","ref_index":66,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"How Hard Is It for Message-Passing GNNs to Simulate One Weisfeiler-Lehman Color-Refinement Step?","primary_cat":"cs.LG","submitted_at":"2024-10-02T08:01:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Oblivious MPGNNs cannot simulate WL color refinement with shallow depth and small messages without randomness; bounded-error randomness enables logarithmic resources for large color sets, while small color sets force layer-message trade-offs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2406.09333","ref_index":38,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Learning Spatial-Preserving Hierarchical Representations for Digital Pathology","primary_cat":"cs.CV","submitted_at":"2024-06-13T17:14:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SPAN is a hierarchical attention framework that constructs multi-scale pyramid representations from single-scale patch inputs for WSI classification and segmentation while preserving spatial relationships.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2404.06335","ref_index":225,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Software and computing for Run 3 of the ATLAS experiment at the LHC","primary_cat":"hep-ex","submitted_at":"2024-04-09T14:19:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":2.0,"formal_verification":"none","one_line_summary":"ATLAS reports on its Run 3 software infrastructure for data management, workflows, databases, validation, and physics analysis tools at the LHC.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2105.14491","ref_index":59,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"How Attentive are Graph Attention Networks?","primary_cat":"cs.LG","submitted_at":"2021-05-30T10:17:58+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"GAT uses static attention where neighbor rankings ignore the query node and thus cannot express some graph problems; GATv2 enables dynamic attention and outperforms GAT on 11 OGB and other benchmarks with equal parameters.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}