{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2019:NXVY4YRQDFTWU6TIKJ3QWSRGPV","short_pith_number":"pith:NXVY4YRQ","schema_version":"1.0","canonical_sha256":"6deb8e623019676a7a6852770b4a267d52a399a37d3aa258a5cd3d0f448e2591","source":{"kind":"arxiv","id":"1904.05780","version":1},"attestation_state":"computed","paper":{"title":"Corpora Generation for Grammatical Error Correction","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["stat.ML"],"primary_cat":"cs.CL","authors_text":"Chris Alberti, Jared Lichtarge, Niki Parmar, Noam Shazeer, Shankar Kumar, Simon Tong","submitted_at":"2019-04-10T05:47:15Z","abstract_excerpt":"Grammatical Error Correction (GEC) has been recently modeled using the sequence-to-sequence framework. However, unlike sequence transduction problems such as machine translation, GEC suffers from the lack of plentiful parallel data. We describe two approaches for generating large parallel datasets for GEC using publicly available Wikipedia data. The first method extracts source-target pairs from Wikipedia edit histories with minimal filtration heuristics, while the second method introduces noise into Wikipedia sentences via round-trip translation through bridge languages. Both strategies yield"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1904.05780","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2019-04-10T05:47:15Z","cross_cats_sorted":["stat.ML"],"title_canon_sha256":"8b7c94e828717420a04eac4c653d7f4e7a3979866792ac6f2bc2722e8fce3bb8","abstract_canon_sha256":"644f63d27db3ddda62744c121c3dcffc83b66db1e051b89e6804993bc65da8bc"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:48:48.470045Z","signature_b64":"eLacN1BjKQwXuMLg5EEhCf+IYAVpw3IUIKILywFfpmr5FCDtddboVaeMEZEJi2J54LH96vaWmwL5bW1AwtSJCg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"6deb8e623019676a7a6852770b4a267d52a399a37d3aa258a5cd3d0f448e2591","last_reissued_at":"2026-05-17T23:48:48.469493Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:48:48.469493Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Corpora Generation for Grammatical Error Correction","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["stat.ML"],"primary_cat":"cs.CL","authors_text":"Chris Alberti, Jared Lichtarge, Niki Parmar, Noam Shazeer, Shankar Kumar, Simon Tong","submitted_at":"2019-04-10T05:47:15Z","abstract_excerpt":"Grammatical Error Correction (GEC) has been recently modeled using the sequence-to-sequence framework. However, unlike sequence transduction problems such as machine translation, GEC suffers from the lack of plentiful parallel data. We describe two approaches for generating large parallel datasets for GEC using publicly available Wikipedia data. The first method extracts source-target pairs from Wikipedia edit histories with minimal filtration heuristics, while the second method introduces noise into Wikipedia sentences via round-trip translation through bridge languages. Both strategies yield"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1904.05780","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1904.05780","created_at":"2026-05-17T23:48:48.469589+00:00"},{"alias_kind":"arxiv_version","alias_value":"1904.05780v1","created_at":"2026-05-17T23:48:48.469589+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1904.05780","created_at":"2026-05-17T23:48:48.469589+00:00"},{"alias_kind":"pith_short_12","alias_value":"NXVY4YRQDFTW","created_at":"2026-05-18T12:33:24.271573+00:00"},{"alias_kind":"pith_short_16","alias_value":"NXVY4YRQDFTWU6TI","created_at":"2026-05-18T12:33:24.271573+00:00"},{"alias_kind":"pith_short_8","alias_value":"NXVY4YRQ","created_at":"2026-05-18T12:33:24.271573+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":1,"internal_anchor_count":0,"sample":[{"citing_arxiv_id":"2604.23627","citing_title":"Neural Grammatical Error Correction for Romanian","ref_index":23,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/NXVY4YRQDFTWU6TIKJ3QWSRGPV","json":"https://pith.science/pith/NXVY4YRQDFTWU6TIKJ3QWSRGPV.json","graph_json":"https://pith.science/api/pith-number/NXVY4YRQDFTWU6TIKJ3QWSRGPV/graph.json","events_json":"https://pith.science/api/pith-number/NXVY4YRQDFTWU6TIKJ3QWSRGPV/events.json","paper":"https://pith.science/paper/NXVY4YRQ"},"agent_actions":{"view_html":"https://pith.science/pith/NXVY4YRQDFTWU6TIKJ3QWSRGPV","download_json":"https://pith.science/pith/NXVY4YRQDFTWU6TIKJ3QWSRGPV.json","view_paper":"https://pith.science/paper/NXVY4YRQ","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1904.05780&json=true","fetch_graph":"https://pith.science/api/pith-number/NXVY4YRQDFTWU6TIKJ3QWSRGPV/graph.json","fetch_events":"https://pith.science/api/pith-number/NXVY4YRQDFTWU6TIKJ3QWSRGPV/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/NXVY4YRQDFTWU6TIKJ3QWSRGPV/action/timestamp_anchor","attest_storage":"https://pith.science/pith/NXVY4YRQDFTWU6TIKJ3QWSRGPV/action/storage_attestation","attest_author":"https://pith.science/pith/NXVY4YRQDFTWU6TIKJ3QWSRGPV/action/author_attestation","sign_citation":"https://pith.science/pith/NXVY4YRQDFTWU6TIKJ3QWSRGPV/action/citation_signature","submit_replication":"https://pith.science/pith/NXVY4YRQDFTWU6TIKJ3QWSRGPV/action/replication_record"}},"created_at":"2026-05-17T23:48:48.469589+00:00","updated_at":"2026-05-17T23:48:48.469589+00:00"}