{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2018:WYEU6VUT5E2RQMPBD2FZBUMU7V","short_pith_number":"pith:WYEU6VUT","schema_version":"1.0","canonical_sha256":"b6094f5693e9351831e11e8b90d194fd5cab4542d1ab303556ca4983034cb103","source":{"kind":"arxiv","id":"1809.00197","version":2},"attestation_state":"computed","paper":{"title":"Dual Conditional Cross-Entropy Filtering of Noisy Parallel Corpora","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Marcin Junczys-Dowmunt","submitted_at":"2018-09-01T14:38:16Z","abstract_excerpt":"In this work we introduce dual conditional cross-entropy filtering for noisy parallel data. For each sentence pair of the noisy parallel corpus we compute cross-entropy scores according to two inverse translation models trained on clean data. We penalize divergent cross-entropies and weigh the penalty by the cross-entropy average of both models. Sorting or thresholding according to these scores results in better subsets of parallel data. We achieve higher BLEU scores with models trained on parallel data filtered only from Paracrawl than with models trained on clean WMT data. We further evaluat"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1809.00197","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2018-09-01T14:38:16Z","cross_cats_sorted":[],"title_canon_sha256":"9dd9580f9ba4ca8a26ab12906eeaea62a0b8755d19af820a3bc4240a18544e03","abstract_canon_sha256":"fdc91f9256c2bcf50913b715add9a7aa12a7f765b38f7a9edc5e802bef394e0e"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:54:22.593549Z","signature_b64":"yrlPUCo9aL7kkJkFroT0m7IyRKRzb+X5sDgDxqcDeZh0WONWcsBsI7IXgH3huJa8a71lLeMKzWohKIEkFZFaAA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"b6094f5693e9351831e11e8b90d194fd5cab4542d1ab303556ca4983034cb103","last_reissued_at":"2026-05-17T23:54:22.593060Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:54:22.593060Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Dual Conditional Cross-Entropy Filtering of Noisy Parallel Corpora","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Marcin Junczys-Dowmunt","submitted_at":"2018-09-01T14:38:16Z","abstract_excerpt":"In this work we introduce dual conditional cross-entropy filtering for noisy parallel data. For each sentence pair of the noisy parallel corpus we compute cross-entropy scores according to two inverse translation models trained on clean data. We penalize divergent cross-entropies and weigh the penalty by the cross-entropy average of both models. Sorting or thresholding according to these scores results in better subsets of parallel data. We achieve higher BLEU scores with models trained on parallel data filtered only from Paracrawl than with models trained on clean WMT data. We further evaluat"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1809.00197","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1809.00197","created_at":"2026-05-17T23:54:22.593125+00:00"},{"alias_kind":"arxiv_version","alias_value":"1809.00197v2","created_at":"2026-05-17T23:54:22.593125+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1809.00197","created_at":"2026-05-17T23:54:22.593125+00:00"},{"alias_kind":"pith_short_12","alias_value":"WYEU6VUT5E2R","created_at":"2026-05-18T12:33:01.666342+00:00"},{"alias_kind":"pith_short_16","alias_value":"WYEU6VUT5E2RQMPB","created_at":"2026-05-18T12:33:01.666342+00:00"},{"alias_kind":"pith_short_8","alias_value":"WYEU6VUT","created_at":"2026-05-18T12:33:01.666342+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/WYEU6VUT5E2RQMPBD2FZBUMU7V","json":"https://pith.science/pith/WYEU6VUT5E2RQMPBD2FZBUMU7V.json","graph_json":"https://pith.science/api/pith-number/WYEU6VUT5E2RQMPBD2FZBUMU7V/graph.json","events_json":"https://pith.science/api/pith-number/WYEU6VUT5E2RQMPBD2FZBUMU7V/events.json","paper":"https://pith.science/paper/WYEU6VUT"},"agent_actions":{"view_html":"https://pith.science/pith/WYEU6VUT5E2RQMPBD2FZBUMU7V","download_json":"https://pith.science/pith/WYEU6VUT5E2RQMPBD2FZBUMU7V.json","view_paper":"https://pith.science/paper/WYEU6VUT","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1809.00197&json=true","fetch_graph":"https://pith.science/api/pith-number/WYEU6VUT5E2RQMPBD2FZBUMU7V/graph.json","fetch_events":"https://pith.science/api/pith-number/WYEU6VUT5E2RQMPBD2FZBUMU7V/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/WYEU6VUT5E2RQMPBD2FZBUMU7V/action/timestamp_anchor","attest_storage":"https://pith.science/pith/WYEU6VUT5E2RQMPBD2FZBUMU7V/action/storage_attestation","attest_author":"https://pith.science/pith/WYEU6VUT5E2RQMPBD2FZBUMU7V/action/author_attestation","sign_citation":"https://pith.science/pith/WYEU6VUT5E2RQMPBD2FZBUMU7V/action/citation_signature","submit_replication":"https://pith.science/pith/WYEU6VUT5E2RQMPBD2FZBUMU7V/action/replication_record"}},"created_at":"2026-05-17T23:54:22.593125+00:00","updated_at":"2026-05-17T23:54:22.593125+00:00"}