{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2019:L7AEHQT4W6H7ZPPCNBQLYZJ3D2","short_pith_number":"pith:L7AEHQT4","schema_version":"1.0","canonical_sha256":"5fc043c27cb78ffcbde26860bc653b1e93f45bc6597163c4f56e3a4db7c59ce2","source":{"kind":"arxiv","id":"1906.08237","version":2},"attestation_state":"computed","paper":{"title":"XLNet: Generalized Autoregressive Pretraining for Language Understanding","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.LG"],"primary_cat":"cs.CL","authors_text":"Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov, Yiming Yang, Zhilin Yang, Zihang Dai","submitted_at":"2019-06-19T17:35:48Z","abstract_excerpt":"With the capability of modeling bidirectional contexts, denoising autoencoding based pretraining like BERT achieves better performance than pretraining approaches based on autoregressive language modeling. However, relying on corrupting the input with masks, BERT neglects dependency between the masked positions and suffers from a pretrain-finetune discrepancy. In light of these pros and cons, we propose XLNet, a generalized autoregressive pretraining method that (1) enables learning bidirectional contexts by maximizing the expected likelihood over all permutations of the factorization order an"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1906.08237","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2019-06-19T17:35:48Z","cross_cats_sorted":["cs.LG"],"title_canon_sha256":"a9d79bdfaed226308c2634d61e66079943bccf6c431d0d4786c951648877d2c2","abstract_canon_sha256":"b55e1b13fcd8ff4f7746ebdd7cd80bb494eecc7398456dd01de0b3c3c7d3d361"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T01:24:07.455628Z","signature_b64":"aS5vIJ0Q8ep7TE2tQpXY5lCPsXIjM0YFfZ2OgSIaUrumepMhNDRgLjV9v3fP1/5sWVSthjss8vsF06+g+PKPDw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"5fc043c27cb78ffcbde26860bc653b1e93f45bc6597163c4f56e3a4db7c59ce2","last_reissued_at":"2026-05-18T01:24:07.454943Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T01:24:07.454943Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"XLNet: Generalized Autoregressive Pretraining for Language Understanding","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.LG"],"primary_cat":"cs.CL","authors_text":"Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov, Yiming Yang, Zhilin Yang, Zihang Dai","submitted_at":"2019-06-19T17:35:48Z","abstract_excerpt":"With the capability of modeling bidirectional contexts, denoising autoencoding based pretraining like BERT achieves better performance than pretraining approaches based on autoregressive language modeling. However, relying on corrupting the input with masks, BERT neglects dependency between the masked positions and suffers from a pretrain-finetune discrepancy. In light of these pros and cons, we propose XLNet, a generalized autoregressive pretraining method that (1) enables learning bidirectional contexts by maximizing the expected likelihood over all permutations of the factorization order an"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1906.08237","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1906.08237","created_at":"2026-05-18T01:24:07.455026+00:00"},{"alias_kind":"arxiv_version","alias_value":"1906.08237v2","created_at":"2026-05-18T01:24:07.455026+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1906.08237","created_at":"2026-05-18T01:24:07.455026+00:00"},{"alias_kind":"pith_short_12","alias_value":"L7AEHQT4W6H7","created_at":"2026-05-18T12:33:21.387695+00:00"},{"alias_kind":"pith_short_16","alias_value":"L7AEHQT4W6H7ZPPC","created_at":"2026-05-18T12:33:21.387695+00:00"},{"alias_kind":"pith_short_8","alias_value":"L7AEHQT4","created_at":"2026-05-18T12:33:21.387695+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":17,"internal_anchor_count":17,"sample":[{"citing_arxiv_id":"1911.05507","citing_title":"Compressive Transformers for Long-Range Sequence Modelling","ref_index":147,"is_internal_anchor":true},{"citing_arxiv_id":"2102.01293","citing_title":"Scaling Laws for Transfer","ref_index":93,"is_internal_anchor":true},{"citing_arxiv_id":"2009.08366","citing_title":"GraphCodeBERT: Pre-training Code Representations with Data Flow","ref_index":26,"is_internal_anchor":true},{"citing_arxiv_id":"2002.08910","citing_title":"How Much Knowledge Can You Pack Into the Parameters of a Language Model?","ref_index":74,"is_internal_anchor":true},{"citing_arxiv_id":"2002.08155","citing_title":"CodeBERT: A Pre-Trained Model for Programming and Natural Languages","ref_index":62,"is_internal_anchor":true},{"citing_arxiv_id":"1910.13461","citing_title":"BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension","ref_index":23,"is_internal_anchor":true},{"citing_arxiv_id":"1910.10683","citing_title":"Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer","ref_index":79,"is_internal_anchor":true},{"citing_arxiv_id":"1910.03771","citing_title":"HuggingFace's Transformers: State-of-the-art Natural Language Processing","ref_index":188,"is_internal_anchor":true},{"citing_arxiv_id":"2112.00861","citing_title":"A General Language Assistant as a Laboratory for Alignment","ref_index":125,"is_internal_anchor":true},{"citing_arxiv_id":"2604.20666","citing_title":"ORPHEAS: A Cross-Lingual Greek-English Embedding Model for Retrieval-Augmented Generation","ref_index":14,"is_internal_anchor":true},{"citing_arxiv_id":"2205.01068","citing_title":"OPT: Open Pre-trained Transformer Language Models","ref_index":39,"is_internal_anchor":true},{"citing_arxiv_id":"1909.08053","citing_title":"Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism","ref_index":33,"is_internal_anchor":true},{"citing_arxiv_id":"2207.05221","citing_title":"Language Models (Mostly) Know What They Know","ref_index":183,"is_internal_anchor":true},{"citing_arxiv_id":"1908.10084","citing_title":"Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks","ref_index":37,"is_internal_anchor":true},{"citing_arxiv_id":"2005.14165","citing_title":"Language Models are Few-Shot Learners","ref_index":82,"is_internal_anchor":true},{"citing_arxiv_id":"2604.15499","citing_title":"SecureRouter: Encrypted Routing for Efficient Secure Inference","ref_index":48,"is_internal_anchor":true},{"citing_arxiv_id":"1907.11692","citing_title":"RoBERTa: A Robustly Optimized BERT Pretraining Approach","ref_index":48,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/L7AEHQT4W6H7ZPPCNBQLYZJ3D2","json":"https://pith.science/pith/L7AEHQT4W6H7ZPPCNBQLYZJ3D2.json","graph_json":"https://pith.science/api/pith-number/L7AEHQT4W6H7ZPPCNBQLYZJ3D2/graph.json","events_json":"https://pith.science/api/pith-number/L7AEHQT4W6H7ZPPCNBQLYZJ3D2/events.json","paper":"https://pith.science/paper/L7AEHQT4"},"agent_actions":{"view_html":"https://pith.science/pith/L7AEHQT4W6H7ZPPCNBQLYZJ3D2","download_json":"https://pith.science/pith/L7AEHQT4W6H7ZPPCNBQLYZJ3D2.json","view_paper":"https://pith.science/paper/L7AEHQT4","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1906.08237&json=true","fetch_graph":"https://pith.science/api/pith-number/L7AEHQT4W6H7ZPPCNBQLYZJ3D2/graph.json","fetch_events":"https://pith.science/api/pith-number/L7AEHQT4W6H7ZPPCNBQLYZJ3D2/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/L7AEHQT4W6H7ZPPCNBQLYZJ3D2/action/timestamp_anchor","attest_storage":"https://pith.science/pith/L7AEHQT4W6H7ZPPCNBQLYZJ3D2/action/storage_attestation","attest_author":"https://pith.science/pith/L7AEHQT4W6H7ZPPCNBQLYZJ3D2/action/author_attestation","sign_citation":"https://pith.science/pith/L7AEHQT4W6H7ZPPCNBQLYZJ3D2/action/citation_signature","submit_replication":"https://pith.science/pith/L7AEHQT4W6H7ZPPCNBQLYZJ3D2/action/replication_record"}},"created_at":"2026-05-18T01:24:07.455026+00:00","updated_at":"2026-05-18T01:24:07.455026+00:00"}