{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2012:CW7NCLNJFMVSPLLQZI46KBHU5P","short_pith_number":"pith:CW7NCLNJ","schema_version":"1.0","canonical_sha256":"15bed12da92b2b27ad70ca39e504f4ebd98b8ea06c9145b0c836e6167c281240","source":{"kind":"arxiv","id":"1207.4371","version":1},"attestation_state":"computed","paper":{"title":"Computing n-Gram Statistics in MapReduce","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.DB","cs.DC"],"primary_cat":"cs.IR","authors_text":"Klaus Berberich, Srikanta Bedathur","submitted_at":"2012-07-18T13:21:10Z","abstract_excerpt":"Statistics about n-grams (i.e., sequences of contiguous words or other tokens in text documents or other string data) are an important building block in information retrieval and natural language processing. In this work, we study how n-gram statistics, optionally restricted by a maximum n-gram length and minimum collection frequency, can be computed efficiently harnessing MapReduce for distributed data processing. We describe different algorithms, ranging from an extension of word counting, via methods based on the Apriori principle, to a novel method Suffix-\\sigma that relies on sorting and "},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1207.4371","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.IR","submitted_at":"2012-07-18T13:21:10Z","cross_cats_sorted":["cs.DB","cs.DC"],"title_canon_sha256":"b7224f700f743d9a00671d700e560b164ecc9a2f2dd1c4e0b1ee767b6febaf1a","abstract_canon_sha256":"d5ef7e957e9dfb3cca2f1a0697c58d5730b443b8268ffe6b2ab673ec54a0804a"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T03:50:46.892884Z","signature_b64":"Fv9L95XGe2eM1ED3Zhteclshhv+39K9FdT6UGacUCpZ88bLWneHzMI5KlZUeA5NdhdunQmAPU2uIXgF5u0t9CA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"15bed12da92b2b27ad70ca39e504f4ebd98b8ea06c9145b0c836e6167c281240","last_reissued_at":"2026-05-18T03:50:46.892220Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T03:50:46.892220Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Computing n-Gram Statistics in MapReduce","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.DB","cs.DC"],"primary_cat":"cs.IR","authors_text":"Klaus Berberich, Srikanta Bedathur","submitted_at":"2012-07-18T13:21:10Z","abstract_excerpt":"Statistics about n-grams (i.e., sequences of contiguous words or other tokens in text documents or other string data) are an important building block in information retrieval and natural language processing. In this work, we study how n-gram statistics, optionally restricted by a maximum n-gram length and minimum collection frequency, can be computed efficiently harnessing MapReduce for distributed data processing. We describe different algorithms, ranging from an extension of word counting, via methods based on the Apriori principle, to a novel method Suffix-\\sigma that relies on sorting and "},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1207.4371","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1207.4371","created_at":"2026-05-18T03:50:46.892325+00:00"},{"alias_kind":"arxiv_version","alias_value":"1207.4371v1","created_at":"2026-05-18T03:50:46.892325+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1207.4371","created_at":"2026-05-18T03:50:46.892325+00:00"},{"alias_kind":"pith_short_12","alias_value":"CW7NCLNJFMVS","created_at":"2026-05-18T12:27:01.376967+00:00"},{"alias_kind":"pith_short_16","alias_value":"CW7NCLNJFMVSPLLQ","created_at":"2026-05-18T12:27:01.376967+00:00"},{"alias_kind":"pith_short_8","alias_value":"CW7NCLNJ","created_at":"2026-05-18T12:27:01.376967+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/CW7NCLNJFMVSPLLQZI46KBHU5P","json":"https://pith.science/pith/CW7NCLNJFMVSPLLQZI46KBHU5P.json","graph_json":"https://pith.science/api/pith-number/CW7NCLNJFMVSPLLQZI46KBHU5P/graph.json","events_json":"https://pith.science/api/pith-number/CW7NCLNJFMVSPLLQZI46KBHU5P/events.json","paper":"https://pith.science/paper/CW7NCLNJ"},"agent_actions":{"view_html":"https://pith.science/pith/CW7NCLNJFMVSPLLQZI46KBHU5P","download_json":"https://pith.science/pith/CW7NCLNJFMVSPLLQZI46KBHU5P.json","view_paper":"https://pith.science/paper/CW7NCLNJ","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1207.4371&json=true","fetch_graph":"https://pith.science/api/pith-number/CW7NCLNJFMVSPLLQZI46KBHU5P/graph.json","fetch_events":"https://pith.science/api/pith-number/CW7NCLNJFMVSPLLQZI46KBHU5P/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/CW7NCLNJFMVSPLLQZI46KBHU5P/action/timestamp_anchor","attest_storage":"https://pith.science/pith/CW7NCLNJFMVSPLLQZI46KBHU5P/action/storage_attestation","attest_author":"https://pith.science/pith/CW7NCLNJFMVSPLLQZI46KBHU5P/action/author_attestation","sign_citation":"https://pith.science/pith/CW7NCLNJFMVSPLLQZI46KBHU5P/action/citation_signature","submit_replication":"https://pith.science/pith/CW7NCLNJFMVSPLLQZI46KBHU5P/action/replication_record"}},"created_at":"2026-05-18T03:50:46.892325+00:00","updated_at":"2026-05-18T03:50:46.892325+00:00"}