{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2018:I5PZIUE6XGVFEH4F3ZC6XN643X","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"4ef2965a641ca44609d7dcc872dfdf7b59efb1208e8d2d949c2258dd3a24cb9d","cross_cats_sorted":["cs.DL","cs.IR","physics.soc-ph"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2018-12-19T17:10:14Z","title_canon_sha256":"78880c828a05b79d7a897cdbc6c8729708c31eefb6843edbd208d23ac9b27c8f"},"schema_version":"1.0","source":{"id":"1812.08092","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"1812.08092","created_at":"2026-05-17T23:57:54Z"},{"alias_kind":"arxiv_version","alias_value":"1812.08092v1","created_at":"2026-05-17T23:57:54Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1812.08092","created_at":"2026-05-17T23:57:54Z"},{"alias_kind":"pith_short_12","alias_value":"I5PZIUE6XGVF","created_at":"2026-05-18T12:32:28Z"},{"alias_kind":"pith_short_16","alias_value":"I5PZIUE6XGVFEH4F","created_at":"2026-05-18T12:32:28Z"},{"alias_kind":"pith_short_8","alias_value":"I5PZIUE6","created_at":"2026-05-18T12:32:28Z"}],"graph_snapshots":[{"event_id":"sha256:adaed4a6da556384d71aa254aa812724179a40b2ec4402139b2d93c913d78f77","target":"graph","created_at":"2026-05-17T23:57:54Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"The use of Project Gutenberg (PG) as a text corpus has been extremely popular in statistical analysis of language for more than 25 years. However, in contrast to other major linguistic datasets of similar importance, no consensual full version of PG exists to date. In fact, most PG studies so far either consider only a small number of manually selected books, leading to potential biased subsets, or employ vastly different pre-processing strategies (often specified in insufficient details), raising concerns regarding the reproducibility of published results. In order to address these shortcomin","authors_text":"Francesc Font-Clos, Martin Gerlach","cross_cats":["cs.DL","cs.IR","physics.soc-ph"],"headline":"","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2018-12-19T17:10:14Z","title":"A standardized Project Gutenberg corpus for statistical analysis of natural language and quantitative linguistics"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1812.08092","kind":"arxiv","version":1},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:74983a0a95adcbc914d4064ff182679aa24c508407cb5aa8619b4f2e6a784a45","target":"record","created_at":"2026-05-17T23:57:54Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"4ef2965a641ca44609d7dcc872dfdf7b59efb1208e8d2d949c2258dd3a24cb9d","cross_cats_sorted":["cs.DL","cs.IR","physics.soc-ph"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2018-12-19T17:10:14Z","title_canon_sha256":"78880c828a05b79d7a897cdbc6c8729708c31eefb6843edbd208d23ac9b27c8f"},"schema_version":"1.0","source":{"id":"1812.08092","kind":"arxiv","version":1}},"canonical_sha256":"475f94509eb9aa521f85de45ebb7dcddc72646b0a50c1876a94163e5143631d1","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"475f94509eb9aa521f85de45ebb7dcddc72646b0a50c1876a94163e5143631d1","first_computed_at":"2026-05-17T23:57:54.929989Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:57:54.929989Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"wrTsKWFdGeDt2ETl8K6+pPJKRNlBAoxSpVhFTOjZnmCdP8m4xq9h0Axex0KmQrl0eEM5oYI1ju0zZW5CstfvBQ==","signature_status":"signed_v1","signed_at":"2026-05-17T23:57:54.930673Z","signed_message":"canonical_sha256_bytes"},"source_id":"1812.08092","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:74983a0a95adcbc914d4064ff182679aa24c508407cb5aa8619b4f2e6a784a45","sha256:adaed4a6da556384d71aa254aa812724179a40b2ec4402139b2d93c913d78f77"],"state_sha256":"e5b160fdd2d037850a16a8314b84cb1249d9c59dafd25f4049459df642334fac"}