{"paper":{"title":"Infini-News: Efficiently Queryable Access to 1.3 Billion Processed Common Crawl News Articles","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Jana Lasser, Kirill Solovev, Ruggero Marino Lazzaroni","submitted_at":"2026-05-18T12:52:14Z","abstract_excerpt":"Large-scale news corpora support a wide range of research in Computational Social Science and NLP, yet access remains constrained: commercial archives impose prohibitive costs and licensing restrictions, while open alternatives like Common Crawl's CC-News require terabyte-scale storage and computationally intensive processing. We present Infini-News, a retrieval toolkit and index for the entire CC-News archive from August 2016 to the latest available snapshot. Our contributions are threefold. First, we extract, clean the text, and parse the structured metadata of over 1.35B articles. Second, w"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.18337","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.18337/integrity.json","findings":[],"available":true,"detectors_run":[{"name":"ai_meta_artifact","ran_at":"2026-05-19T23:33:35.168707Z","status":"skipped","version":"1.0.0","findings_count":0},{"name":"claim_evidence","ran_at":"2026-05-19T23:21:58.835267Z","status":"completed","version":"1.0.0","findings_count":0}],"snapshot_sha256":"af62def547da31b3eca1e94305899557f4ab12815d5b63f9b28a72d24e2dea31"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"}