{"work":{"id":"41fe12c4-e538-4890-a244-480650ed3078","openalex_id":null,"doi":null,"arxiv_id":"1907.11692","raw_key":null,"title":"RoBERTa: A Robustly Optimized BERT Pretraining Approach","authors":null,"authors_text":"Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen","year":2019,"venue":"cs.CL","abstract":"Language model pretraining has led to significant performance gains but careful comparison between different approaches is challenging. Training is computationally expensive, often done on private datasets of different sizes, and, as we will show, hyperparameter choices have significant impact on the final results. We present a replication study of BERT pretraining (Devlin et al., 2019) that carefully measures the impact of many key hyperparameters and training data size. We find that BERT was significantly undertrained, and can match or exceed the performance of every model published after it. Our best model achieves state-of-the-art results on GLUE, RACE and SQuAD. These results highlight the importance of previously overlooked design choices, and raise questions about the source of recently reported improvements. We release our models and code.","external_url":"https://arxiv.org/abs/1907.11692","cited_by_count":null,"metadata_source":"pith","metadata_fetched_at":"2026-05-25T08:10:31.435190+00:00","pith_arxiv_id":"1907.11692","created_at":"2026-05-08T16:53:29.104086+00:00","updated_at":"2026-05-25T08:10:31.435190+00:00","title_quality_ok":true,"display_title":"RoBERTa: A Robustly Optimized BERT Pretraining Approach","render_title":"RoBERTa: A Robustly Optimized BERT Pretraining Approach"},"hub":{"state":{"work_id":"41fe12c4-e538-4890-a244-480650ed3078","tier":"super_hub","tier_reason":"100+ Pith inbound or 10,000+ external citations","pith_inbound_count":395,"external_cited_by_count":null,"distinct_field_count":21,"first_pith_cited_at":"2019-06-19T17:35:48+00:00","last_pith_cited_at":"2026-05-22T13:55:11+00:00","author_build_status":"needed","summary_status":"needed","contexts_status":"needed","graph_status":"needed","ask_index_status":"needed","reader_status":"not_needed","recognition_status":"not_needed","updated_at":"2026-05-29T21:40:30.257885+00:00","tier_text":"super_hub"},"tier":"super_hub","role_counts":[{"context_role":"background","n":45},{"context_role":"method","n":12},{"context_role":"baseline","n":5},{"context_role":"dataset","n":3}],"polarity_counts":[{"context_polarity":"background","n":41},{"context_polarity":"use_method","n":12},{"context_polarity":"baseline","n":5},{"context_polarity":"support","n":3},{"context_polarity":"use_dataset","n":3},{"context_polarity":"unclear","n":1}],"runs":{"ask_index":{"job_type":"ask_index","status":"succeeded","result":{"title":"RoBERTa: A Robustly Optimized BERT Pretraining Approach","claims":[{"claim_text":"Language model pretraining has led to significant performance gains but careful comparison between different approaches is challenging. Training is computationally expensive, often done on private datasets of different sizes, and, as we will show, hyperparameter choices have significant impact on the final results. We present a replication study of BERT pretraining (Devlin et al., 2019) that carefully measures the impact of many key hyperparameters and training data size. We find that BERT was significantly undertrained, and can match or exceed the performance of every model published after it","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks RoBERTa: A Robustly Optimized BERT Pretraining Approach because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-13T19:53:36.961746+00:00"},"author_expand":{"job_type":"author_expand","status":"succeeded","result":{"authors_linked":[{"id":"37d3c70e-f094-44d4-a128-a97f09cc7374","orcid":null,"display_name":"Yinhan Liu"},{"id":"737c641a-b7d9-40db-a7ba-218dc54c500d","orcid":null,"display_name":"Myle Ott"},{"id":"63b793df-28cb-4279-a2b8-3385a04bd3d4","orcid":null,"display_name":"Naman Goyal"},{"id":"5ede1007-9973-4452-ba48-c78827d3385b","orcid":null,"display_name":"Jingfei Du"},{"id":"89203f73-bcbb-42d4-a175-dea3b337ba8b","orcid":null,"display_name":"Mandar Joshi"},{"id":"7234fb01-d92c-46ec-a5f6-b58f1f68dae7","orcid":null,"display_name":"Danqi Chen"}]},"error":null,"updated_at":"2026-05-13T19:53:36.959296+00:00"},"context_extract":{"job_type":"context_extract","status":"succeeded","result":{"enqueued_papers":25},"error":null,"updated_at":"2026-05-13T19:53:36.248777+00:00"},"graph_features":{"job_type":"graph_features","status":"succeeded","result":{"co_cited":[{"title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding","work_id":"ed240a10-5b19-406c-baa5-30803f465785","shared_citers":38},{"title":"doi: 10.18653/v1/N19-1423","work_id":"3e3c8ac8-b858-4b22-af32-393d98c883e0","shared_citers":27},{"title":"LLaMA: Open and Efficient Foundation Language Models","work_id":"c018fc23-6f3f-4035-9d02-28a2173b2b9d","shared_citers":22},{"title":"The Llama 3 Herd of Models","work_id":"1549a635-88af-4ac1-acfe-51ae7bb53345","shared_citers":22},{"title":"DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter","work_id":"756f9764-ecd6-4672-8043-b37c698c7ad2","shared_citers":21},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":21},{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","shared_citers":19},{"title":"Scaling Laws for Neural Language Models","work_id":"b7dd8749-9c45-4977-ab9b-64478dce1ae8","shared_citers":18},{"title":"Decoupled Weight Decay Regularization","work_id":"07ef7360-d385-4033-83f7-8384a6325204","shared_citers":16},{"title":"Llama 2: Open Foundation and Fine-Tuned Chat Models","work_id":"68a5177f-d644-44c1-bd4f-4e5278c22f5d","shared_citers":16},{"title":"Evaluating Large Language Models Trained on Code","work_id":"042493e9-b26f-4b4e-bbde-382072ca9b08","shared_citers":15},{"title":"Language Models are Few-Shot Learners","work_id":"214732c0-2edd-44a0-af9e-28184a2b8279","shared_citers":15},{"title":"Distilling the Knowledge in a Neural Network","work_id":"d927ab1f-17b8-4002-9d09-c3d55764fbad","shared_citers":14},{"title":"ALBERT: A Lite BERT for Self-supervised Learning of Language Representations","work_id":"aedf7950-7c35-4e28-a32d-bec290f51669","shared_citers":12},{"title":"A Survey of Large Language Models","work_id":"de1b42b5-4a0a-4b1f-8c78-1f7fe21be6c9","shared_citers":12},{"title":"Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer","work_id":"50e3b368-0243-4726-8186-233869802ad1","shared_citers":12},{"title":"Longformer: The Long-Document Transformer","work_id":"abea7a44-6668-4de7-aab6-f53a6e5aa088","shared_citers":12},{"title":"Mistral 7B","work_id":"eb5e1305-ad11-4875-ad8d-ad8b8f697599","shared_citers":12},{"title":"PaLM: Scaling Language Modeling with Pathways","work_id":"a94f3ef7-2c49-4445-93fe-6ec16aafd966","shared_citers":12},{"title":"Training Verifiers to Solve Math Word Problems","work_id":"acab1aa8-b4d6-40e0-a3ee-25341701dca2","shared_citers":12},{"title":"Adam: A Method for Stochastic Optimization","work_id":"1910796d-9b52-4683-bf5c-de9632c1028b","shared_citers":11},{"title":"DeBERTa: Decoding-enhanced BERT with Disentangled Attention","work_id":"cb3211f6-363f-4355-bb84-2ecfd7e78875","shared_citers":11},{"title":"DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning","work_id":"e6b75ad5-2877-4168-97c8-710407094d20","shared_citers":11},{"title":"OPT: Open Pre-trained Transformer Language Models","work_id":"d7ff3b21-1fff-4cf4-952a-4714e3ef2307","shared_citers":11}],"time_series":[{"n":6,"year":2019},{"n":6,"year":2020},{"n":3,"year":2021},{"n":9,"year":2022},{"n":12,"year":2023},{"n":3,"year":2024},{"n":1,"year":2025},{"n":156,"year":2026}]},"error":null,"updated_at":"2026-05-13T19:53:36.373532+00:00"},"identity_refresh":{"job_type":"identity_refresh","status":"succeeded","result":{"fixed":1,"items":[{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","resolver":"local_arxiv","confidence":0.98,"old_work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e"}],"errors":[],"attempted":1},"error":null,"updated_at":"2026-05-13T19:53:35.309531+00:00"},"role_polarity":{"job_type":"role_polarity","status":"succeeded","result":{"title":"RoBERTa: A Robustly Optimized BERT Pretraining Approach","claims":[{"claim_text":"Language model pretraining has led to significant performance gains but careful comparison between different approaches is challenging. Training is computationally expensive, often done on private datasets of different sizes, and, as we will show, hyperparameter choices have significant impact on the final results. We present a replication study of BERT pretraining (Devlin et al., 2019) that carefully measures the impact of many key hyperparameters and training data size. We find that BERT was significantly undertrained, and can match or exceed the performance of every model published after it","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks RoBERTa: A Robustly Optimized BERT Pretraining Approach because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-13T19:53:36.254349+00:00"},"summary_claims":{"job_type":"summary_claims","status":"succeeded","result":{"title":"RoBERTa: A Robustly Optimized BERT Pretraining Approach","claims":[{"claim_text":"Language model pretraining has led to significant performance gains but careful comparison between different approaches is challenging. Training is computationally expensive, often done on private datasets of different sizes, and, as we will show, hyperparameter choices have significant impact on the final results. We present a replication study of BERT pretraining (Devlin et al., 2019) that carefully measures the impact of many key hyperparameters and training data size. We find that BERT was significantly undertrained, and can match or exceed the performance of every model published after it","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks RoBERTa: A Robustly Optimized BERT Pretraining Approach because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-13T19:53:36.252551+00:00"}},"summary":{"title":"RoBERTa: A Robustly Optimized BERT Pretraining Approach","claims":[{"claim_text":"Language model pretraining has led to significant performance gains but careful comparison between different approaches is challenging. Training is computationally expensive, often done on private datasets of different sizes, and, as we will show, hyperparameter choices have significant impact on the final results. We present a replication study of BERT pretraining (Devlin et al., 2019) that carefully measures the impact of many key hyperparameters and training data size. We find that BERT was significantly undertrained, and can match or exceed the performance of every model published after it","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks RoBERTa: A Robustly Optimized BERT Pretraining Approach because it crossed a citation-hub threshold.","role_counts":[]},"graph":{"co_cited":[{"title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding","work_id":"ed240a10-5b19-406c-baa5-30803f465785","shared_citers":38},{"title":"doi: 10.18653/v1/N19-1423","work_id":"3e3c8ac8-b858-4b22-af32-393d98c883e0","shared_citers":27},{"title":"LLaMA: Open and Efficient Foundation Language Models","work_id":"c018fc23-6f3f-4035-9d02-28a2173b2b9d","shared_citers":22},{"title":"The Llama 3 Herd of Models","work_id":"1549a635-88af-4ac1-acfe-51ae7bb53345","shared_citers":22},{"title":"DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter","work_id":"756f9764-ecd6-4672-8043-b37c698c7ad2","shared_citers":21},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":21},{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","shared_citers":19},{"title":"Scaling Laws for Neural Language Models","work_id":"b7dd8749-9c45-4977-ab9b-64478dce1ae8","shared_citers":18},{"title":"Decoupled Weight Decay Regularization","work_id":"07ef7360-d385-4033-83f7-8384a6325204","shared_citers":16},{"title":"Llama 2: Open Foundation and Fine-Tuned Chat Models","work_id":"68a5177f-d644-44c1-bd4f-4e5278c22f5d","shared_citers":16},{"title":"Evaluating Large Language Models Trained on Code","work_id":"042493e9-b26f-4b4e-bbde-382072ca9b08","shared_citers":15},{"title":"Language Models are Few-Shot Learners","work_id":"214732c0-2edd-44a0-af9e-28184a2b8279","shared_citers":15},{"title":"Distilling the Knowledge in a Neural Network","work_id":"d927ab1f-17b8-4002-9d09-c3d55764fbad","shared_citers":14},{"title":"ALBERT: A Lite BERT for Self-supervised Learning of Language Representations","work_id":"aedf7950-7c35-4e28-a32d-bec290f51669","shared_citers":12},{"title":"A Survey of Large Language Models","work_id":"de1b42b5-4a0a-4b1f-8c78-1f7fe21be6c9","shared_citers":12},{"title":"Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer","work_id":"50e3b368-0243-4726-8186-233869802ad1","shared_citers":12},{"title":"Longformer: The Long-Document Transformer","work_id":"abea7a44-6668-4de7-aab6-f53a6e5aa088","shared_citers":12},{"title":"Mistral 7B","work_id":"eb5e1305-ad11-4875-ad8d-ad8b8f697599","shared_citers":12},{"title":"PaLM: Scaling Language Modeling with Pathways","work_id":"a94f3ef7-2c49-4445-93fe-6ec16aafd966","shared_citers":12},{"title":"Training Verifiers to Solve Math Word Problems","work_id":"acab1aa8-b4d6-40e0-a3ee-25341701dca2","shared_citers":12},{"title":"Adam: A Method for Stochastic Optimization","work_id":"1910796d-9b52-4683-bf5c-de9632c1028b","shared_citers":11},{"title":"DeBERTa: Decoding-enhanced BERT with Disentangled Attention","work_id":"cb3211f6-363f-4355-bb84-2ecfd7e78875","shared_citers":11},{"title":"DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning","work_id":"e6b75ad5-2877-4168-97c8-710407094d20","shared_citers":11},{"title":"OPT: Open Pre-trained Transformer Language Models","work_id":"d7ff3b21-1fff-4cf4-952a-4714e3ef2307","shared_citers":11}],"time_series":[{"n":6,"year":2019},{"n":6,"year":2020},{"n":3,"year":2021},{"n":9,"year":2022},{"n":12,"year":2023},{"n":3,"year":2024},{"n":1,"year":2025},{"n":156,"year":2026}]},"authors":[{"id":"7234fb01-d92c-46ec-a5f6-b58f1f68dae7","orcid":null,"display_name":"Danqi Chen","source":"manual","import_confidence":0.72},{"id":"5ede1007-9973-4452-ba48-c78827d3385b","orcid":null,"display_name":"Jingfei Du","source":"manual","import_confidence":0.72},{"id":"89203f73-bcbb-42d4-a175-dea3b337ba8b","orcid":null,"display_name":"Mandar Joshi","source":"manual","import_confidence":0.72},{"id":"737c641a-b7d9-40db-a7ba-218dc54c500d","orcid":null,"display_name":"Myle Ott","source":"manual","import_confidence":0.72},{"id":"63b793df-28cb-4279-a2b8-3385a04bd3d4","orcid":null,"display_name":"Naman Goyal","source":"manual","import_confidence":0.72},{"id":"37d3c70e-f094-44d4-a128-a97f09cc7374","orcid":null,"display_name":"Yinhan Liu","source":"manual","import_confidence":0.72}]}}