{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2019:U32ILPHTC4M5QSEYWADAQCZJUJ","short_pith_number":"pith:U32ILPHT","canonical_record":{"source":{"id":"1906.11751","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2019-06-27T15:53:58Z","cross_cats_sorted":[],"title_canon_sha256":"4773e775e84b49c636b172fcb372d6ee0e82c4027e5907a36b197a400cac9fbc","abstract_canon_sha256":"7a5fe6b565ce21a6f1dcf2e5eca46b3a1a09a2ad94b102a39117fc3d2a74476f"},"schema_version":"1.0"},"canonical_sha256":"a6f485bcf31719d84898b006080b29a27aa7fc6d6475d1fa915b1d5c2176aa7e","source":{"kind":"arxiv","id":"1906.11751","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"1906.11751","created_at":"2026-05-17T23:42:03Z"},{"alias_kind":"arxiv_version","alias_value":"1906.11751v1","created_at":"2026-05-17T23:42:03Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1906.11751","created_at":"2026-05-17T23:42:03Z"},{"alias_kind":"pith_short_12","alias_value":"U32ILPHTC4M5","created_at":"2026-05-18T12:33:30Z"},{"alias_kind":"pith_short_16","alias_value":"U32ILPHTC4M5QSEY","created_at":"2026-05-18T12:33:30Z"},{"alias_kind":"pith_short_8","alias_value":"U32ILPHT","created_at":"2026-05-18T12:33:30Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2019:U32ILPHTC4M5QSEYWADAQCZJUJ","target":"record","payload":{"canonical_record":{"source":{"id":"1906.11751","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2019-06-27T15:53:58Z","cross_cats_sorted":[],"title_canon_sha256":"4773e775e84b49c636b172fcb372d6ee0e82c4027e5907a36b197a400cac9fbc","abstract_canon_sha256":"7a5fe6b565ce21a6f1dcf2e5eca46b3a1a09a2ad94b102a39117fc3d2a74476f"},"schema_version":"1.0"},"canonical_sha256":"a6f485bcf31719d84898b006080b29a27aa7fc6d6475d1fa915b1d5c2176aa7e","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:42:03.882993Z","signature_b64":"sM+3avzFKfXGgjxzZSBqphmIawMgVpDRnPPAKUFvGbV0zA5y3FxWRvlVNk6dHfAZuT7qol6Mb3agpzLXiUg7Cg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"a6f485bcf31719d84898b006080b29a27aa7fc6d6475d1fa915b1d5c2176aa7e","last_reissued_at":"2026-05-17T23:42:03.882405Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:42:03.882405Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"1906.11751","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:42:03Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"I8Q7PP6Or+tAbHngtyrUlO8edorUODhcXOl3IIKIZ1NOL90pAeFmJ644mptji+bX8pWNPz8Fk8XLPumAjFqWCg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-28T07:54:24.815343Z"},"content_sha256":"50cf7d027d0b8893f1efd00aa1fc04fb28cfa4ad98d8ce60e31b0bf1744504b9","schema_version":"1.0","event_id":"sha256:50cf7d027d0b8893f1efd00aa1fc04fb28cfa4ad98d8ce60e31b0bf1744504b9"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2019:U32ILPHTC4M5QSEYWADAQCZJUJ","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"The Impact of Preprocessing on Arabic-English Statistical and Neural Machine Translation","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Amjad Almahairi, Mai Oudah, Nizar Habash","submitted_at":"2019-06-27T15:53:58Z","abstract_excerpt":"Neural networks have become the state-of-the-art approach for machine translation (MT) in many languages. While linguistically-motivated tokenization techniques were shown to have significant effects on the performance of statistical MT, it remains unclear if those techniques are well suited for neural MT. In this paper, we systematically compare neural and statistical MT models for Arabic-English translation on data preprecossed by various prominent tokenization schemes. Furthermore, we consider a range of data and vocabulary sizes and compare their effect on both approaches. Our empirical re"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1906.11751","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:42:03Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"FTkyp5nSAZc9XocuQHF8S719kZjtGP3h1tjc9xLaTZOZrpVplDuz49OAUoUNoA/1+WwrsLTZCjXhsJvSbszYBg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-28T07:54:24.815688Z"},"content_sha256":"811f13a7c63fb77095c6861db3c4991c4850f93ffcca7c29be0cb350206b95e2","schema_version":"1.0","event_id":"sha256:811f13a7c63fb77095c6861db3c4991c4850f93ffcca7c29be0cb350206b95e2"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/U32ILPHTC4M5QSEYWADAQCZJUJ/bundle.json","state_url":"https://pith.science/pith/U32ILPHTC4M5QSEYWADAQCZJUJ/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/U32ILPHTC4M5QSEYWADAQCZJUJ/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-28T07:54:24Z","links":{"resolver":"https://pith.science/pith/U32ILPHTC4M5QSEYWADAQCZJUJ","bundle":"https://pith.science/pith/U32ILPHTC4M5QSEYWADAQCZJUJ/bundle.json","state":"https://pith.science/pith/U32ILPHTC4M5QSEYWADAQCZJUJ/state.json","well_known_bundle":"https://pith.science/.well-known/pith/U32ILPHTC4M5QSEYWADAQCZJUJ/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2019:U32ILPHTC4M5QSEYWADAQCZJUJ","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"7a5fe6b565ce21a6f1dcf2e5eca46b3a1a09a2ad94b102a39117fc3d2a74476f","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2019-06-27T15:53:58Z","title_canon_sha256":"4773e775e84b49c636b172fcb372d6ee0e82c4027e5907a36b197a400cac9fbc"},"schema_version":"1.0","source":{"id":"1906.11751","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"1906.11751","created_at":"2026-05-17T23:42:03Z"},{"alias_kind":"arxiv_version","alias_value":"1906.11751v1","created_at":"2026-05-17T23:42:03Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1906.11751","created_at":"2026-05-17T23:42:03Z"},{"alias_kind":"pith_short_12","alias_value":"U32ILPHTC4M5","created_at":"2026-05-18T12:33:30Z"},{"alias_kind":"pith_short_16","alias_value":"U32ILPHTC4M5QSEY","created_at":"2026-05-18T12:33:30Z"},{"alias_kind":"pith_short_8","alias_value":"U32ILPHT","created_at":"2026-05-18T12:33:30Z"}],"graph_snapshots":[{"event_id":"sha256:811f13a7c63fb77095c6861db3c4991c4850f93ffcca7c29be0cb350206b95e2","target":"graph","created_at":"2026-05-17T23:42:03Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"Neural networks have become the state-of-the-art approach for machine translation (MT) in many languages. While linguistically-motivated tokenization techniques were shown to have significant effects on the performance of statistical MT, it remains unclear if those techniques are well suited for neural MT. In this paper, we systematically compare neural and statistical MT models for Arabic-English translation on data preprecossed by various prominent tokenization schemes. Furthermore, we consider a range of data and vocabulary sizes and compare their effect on both approaches. Our empirical re","authors_text":"Amjad Almahairi, Mai Oudah, Nizar Habash","cross_cats":[],"headline":"","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2019-06-27T15:53:58Z","title":"The Impact of Preprocessing on Arabic-English Statistical and Neural Machine Translation"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1906.11751","kind":"arxiv","version":1},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:50cf7d027d0b8893f1efd00aa1fc04fb28cfa4ad98d8ce60e31b0bf1744504b9","target":"record","created_at":"2026-05-17T23:42:03Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"7a5fe6b565ce21a6f1dcf2e5eca46b3a1a09a2ad94b102a39117fc3d2a74476f","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2019-06-27T15:53:58Z","title_canon_sha256":"4773e775e84b49c636b172fcb372d6ee0e82c4027e5907a36b197a400cac9fbc"},"schema_version":"1.0","source":{"id":"1906.11751","kind":"arxiv","version":1}},"canonical_sha256":"a6f485bcf31719d84898b006080b29a27aa7fc6d6475d1fa915b1d5c2176aa7e","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"a6f485bcf31719d84898b006080b29a27aa7fc6d6475d1fa915b1d5c2176aa7e","first_computed_at":"2026-05-17T23:42:03.882405Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:42:03.882405Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"sM+3avzFKfXGgjxzZSBqphmIawMgVpDRnPPAKUFvGbV0zA5y3FxWRvlVNk6dHfAZuT7qol6Mb3agpzLXiUg7Cg==","signature_status":"signed_v1","signed_at":"2026-05-17T23:42:03.882993Z","signed_message":"canonical_sha256_bytes"},"source_id":"1906.11751","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:50cf7d027d0b8893f1efd00aa1fc04fb28cfa4ad98d8ce60e31b0bf1744504b9","sha256:811f13a7c63fb77095c6861db3c4991c4850f93ffcca7c29be0cb350206b95e2"],"state_sha256":"4c598b8960ce0cd220fdb13ad7fbf0ad5cf26fe40c50707b39e7a7be425fd8c3"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"j/PL93iX17qv8rounO9t4Qx2N/R7POrUbi68yMvExyJYiVQE6MOfdnHtFUCtdoYI2jZHQaSkVdcOmCkmFBEGAA==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-28T07:54:24.817780Z","bundle_sha256":"0f7b180eaa40b2c08e91ed05022fc530b0356b91ab095df6fb77b909eec62e94"}}