{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2019:U77OXXT7EOISGA4TXCT5RV4XV7","short_pith_number":"pith:U77OXXT7","schema_version":"1.0","canonical_sha256":"a7feebde7f2391230393b8a7d8d797afea150d456ca1a9696d2589ed53a8e096","source":{"kind":"arxiv","id":"1907.05791","version":2},"attestation_state":"computed","paper":{"title":"WikiMatrix: Mining 135M Parallel Sentences in 1620 Language Pairs from Wikipedia","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Francisco Guzm\\'an, Holger Schwenk, Hongyu Gong, Shuo Sun, Vishrav Chaudhary","submitted_at":"2019-07-10T23:57:30Z","abstract_excerpt":"We present an approach based on multilingual sentence embeddings to automatically extract parallel sentences from the content of Wikipedia articles in 85 languages, including several dialects or low-resource languages. We do not limit the the extraction process to alignments with English, but systematically consider all possible language pairs. In total, we are able to extract 135M parallel sentences for 1620 different language pairs, out of which only 34M are aligned with English. This corpus of parallel sentences is freely available at https://github.com/facebookresearch/LASER/tree/master/ta"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1907.05791","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2019-07-10T23:57:30Z","cross_cats_sorted":[],"title_canon_sha256":"2417263109f293d31a702bcddca0c143d633e3db74e73c442e8bd9fd9bc3c208","abstract_canon_sha256":"bcf9c12dc0728931019521e7365176f23ba6d37dd69ff93b8256bdc0770f5c1d"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:40:29.539028Z","signature_b64":"ATqR0RqOzKFZScCxXGSZdkXcbegEUrbSPWTnGKKgO1M06NRFZAyBG18sAMmIYXMLSRbv2eKpfnLE0AQzDzRlDg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"a7feebde7f2391230393b8a7d8d797afea150d456ca1a9696d2589ed53a8e096","last_reissued_at":"2026-05-17T23:40:29.538369Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:40:29.538369Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"WikiMatrix: Mining 135M Parallel Sentences in 1620 Language Pairs from Wikipedia","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Francisco Guzm\\'an, Holger Schwenk, Hongyu Gong, Shuo Sun, Vishrav Chaudhary","submitted_at":"2019-07-10T23:57:30Z","abstract_excerpt":"We present an approach based on multilingual sentence embeddings to automatically extract parallel sentences from the content of Wikipedia articles in 85 languages, including several dialects or low-resource languages. We do not limit the the extraction process to alignments with English, but systematically consider all possible language pairs. In total, we are able to extract 135M parallel sentences for 1620 different language pairs, out of which only 34M are aligned with English. This corpus of parallel sentences is freely available at https://github.com/facebookresearch/LASER/tree/master/ta"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1907.05791","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1907.05791","created_at":"2026-05-17T23:40:29.538465+00:00"},{"alias_kind":"arxiv_version","alias_value":"1907.05791v2","created_at":"2026-05-17T23:40:29.538465+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1907.05791","created_at":"2026-05-17T23:40:29.538465+00:00"},{"alias_kind":"pith_short_12","alias_value":"U77OXXT7EOIS","created_at":"2026-05-18T12:33:30.264802+00:00"},{"alias_kind":"pith_short_16","alias_value":"U77OXXT7EOISGA4T","created_at":"2026-05-18T12:33:30.264802+00:00"},{"alias_kind":"pith_short_8","alias_value":"U77OXXT7","created_at":"2026-05-18T12:33:30.264802+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":1,"internal_anchor_count":0,"sample":[{"citing_arxiv_id":"2605.09476","citing_title":"Align and Shine: Building High-Quality Sentence-Aligned Corpora for Multilingual Text Simplification","ref_index":17,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/U77OXXT7EOISGA4TXCT5RV4XV7","json":"https://pith.science/pith/U77OXXT7EOISGA4TXCT5RV4XV7.json","graph_json":"https://pith.science/api/pith-number/U77OXXT7EOISGA4TXCT5RV4XV7/graph.json","events_json":"https://pith.science/api/pith-number/U77OXXT7EOISGA4TXCT5RV4XV7/events.json","paper":"https://pith.science/paper/U77OXXT7"},"agent_actions":{"view_html":"https://pith.science/pith/U77OXXT7EOISGA4TXCT5RV4XV7","download_json":"https://pith.science/pith/U77OXXT7EOISGA4TXCT5RV4XV7.json","view_paper":"https://pith.science/paper/U77OXXT7","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1907.05791&json=true","fetch_graph":"https://pith.science/api/pith-number/U77OXXT7EOISGA4TXCT5RV4XV7/graph.json","fetch_events":"https://pith.science/api/pith-number/U77OXXT7EOISGA4TXCT5RV4XV7/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/U77OXXT7EOISGA4TXCT5RV4XV7/action/timestamp_anchor","attest_storage":"https://pith.science/pith/U77OXXT7EOISGA4TXCT5RV4XV7/action/storage_attestation","attest_author":"https://pith.science/pith/U77OXXT7EOISGA4TXCT5RV4XV7/action/author_attestation","sign_citation":"https://pith.science/pith/U77OXXT7EOISGA4TXCT5RV4XV7/action/citation_signature","submit_replication":"https://pith.science/pith/U77OXXT7EOISGA4TXCT5RV4XV7/action/replication_record"}},"created_at":"2026-05-17T23:40:29.538465+00:00","updated_at":"2026-05-17T23:40:29.538465+00:00"}