{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2016:YLLO6PI2FFLCXXI4GK5RU3XDF3","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"b185ac5d3489fa7f7d776d2422deb7832169b2493943abe2685c205d3445f199","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2016-08-28T17:51:41Z","title_canon_sha256":"b59fb599bdf8936f3ae15fc8e086a214d99bf4b5e3e9cbec3bb1efe2c5208148"},"schema_version":"1.0","source":{"id":"1608.07836","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"1608.07836","created_at":"2026-05-18T01:07:49Z"},{"alias_kind":"arxiv_version","alias_value":"1608.07836v1","created_at":"2026-05-18T01:07:49Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1608.07836","created_at":"2026-05-18T01:07:49Z"},{"alias_kind":"pith_short_12","alias_value":"YLLO6PI2FFLC","created_at":"2026-05-18T12:30:53Z"},{"alias_kind":"pith_short_16","alias_value":"YLLO6PI2FFLCXXI4","created_at":"2026-05-18T12:30:53Z"},{"alias_kind":"pith_short_8","alias_value":"YLLO6PI2","created_at":"2026-05-18T12:30:53Z"}],"graph_snapshots":[{"event_id":"sha256:7024523a65ab0fe8a9053d9cd168b5d94aefe723efbe6bc537e8df35a2063330","target":"graph","created_at":"2026-05-18T01:07:49Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"Real world data differs radically from the benchmark corpora we use in natural language processing (NLP). As soon as we apply our technologies to the real world, performance drops. The reason for this problem is obvious: NLP models are trained on samples from a limited set of canonical varieties that are considered standard, most prominently English newswire. However, there are many dimensions, e.g., socio-demographics, language, genre, sentence type, etc. on which texts can differ from the standard. The solution is not obvious: we cannot control for all factors, and it is not clear how to bes","authors_text":"Barbara Plank","cross_cats":[],"headline":"","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2016-08-28T17:51:41Z","title":"What to do about non-standard (or non-canonical) language in NLP"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1608.07836","kind":"arxiv","version":1},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:3662941a52e101b63fb3f6c35d485956e53c775530d70fa26bbf66e4e495bb2e","target":"record","created_at":"2026-05-18T01:07:49Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"b185ac5d3489fa7f7d776d2422deb7832169b2493943abe2685c205d3445f199","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2016-08-28T17:51:41Z","title_canon_sha256":"b59fb599bdf8936f3ae15fc8e086a214d99bf4b5e3e9cbec3bb1efe2c5208148"},"schema_version":"1.0","source":{"id":"1608.07836","kind":"arxiv","version":1}},"canonical_sha256":"c2d6ef3d1a29562bdd1c32bb1a6ee32ee5f60af1d0e4b15a52e28301f7610e9b","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"c2d6ef3d1a29562bdd1c32bb1a6ee32ee5f60af1d0e4b15a52e28301f7610e9b","first_computed_at":"2026-05-18T01:07:49.795926Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-18T01:07:49.795926Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"m15JGhuYr3mQnH/7esN+rYBPRZuobmwa1zUryi9ew6kSOpNOPiHVNKOsNVO5WJdpsqrVcUYMYrfAbp0pU4vvBw==","signature_status":"signed_v1","signed_at":"2026-05-18T01:07:49.796472Z","signed_message":"canonical_sha256_bytes"},"source_id":"1608.07836","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:3662941a52e101b63fb3f6c35d485956e53c775530d70fa26bbf66e4e495bb2e","sha256:7024523a65ab0fe8a9053d9cd168b5d94aefe723efbe6bc537e8df35a2063330"],"state_sha256":"69954ae22536b0635110dde748f28467125e480e554778a9d922e10e7201d90b"}