{"work":{"id":"240c67fe-d14d-4520-91c1-38a4e272ca19","openalex_id":null,"doi":null,"arxiv_id":"1707.06347","raw_key":null,"title":"Proximal Policy Optimization Algorithms","authors":null,"authors_text":"John Schulman, Filip Wolski, Prafulla Dhariwal, Alec Radford, Oleg Klimov","year":2017,"venue":"cs.LG","abstract":"We propose a new family of policy gradient methods for reinforcement learning, which alternate between sampling data through interaction with the environment, and optimizing a \"surrogate\" objective function using stochastic gradient ascent. Whereas standard policy gradient methods perform one gradient update per data sample, we propose a novel objective function that enables multiple epochs of minibatch updates. The new methods, which we call proximal policy optimization (PPO), have some of the benefits of trust region policy optimization (TRPO), but they are much simpler to implement, more general, and have better sample complexity (empirically). Our experiments test PPO on a collection of benchmark tasks, including simulated robotic locomotion and Atari game playing, and we show that PPO outperforms other online policy gradient methods, and overall strikes a favorable balance between sample complexity, simplicity, and wall-time.","external_url":"https://arxiv.org/abs/1707.06347","cited_by_count":null,"metadata_source":"pith","metadata_fetched_at":"2026-06-29T11:53:23.682784+00:00","pith_arxiv_id":"1707.06347","created_at":"2026-05-08T18:44:01.332768+00:00","updated_at":"2026-06-29T11:53:23.682784+00:00","title_quality_ok":true,"display_title":"Proximal Policy Optimization Algorithms","render_title":"Proximal Policy Optimization Algorithms"},"hub":{"state":{"work_id":"240c67fe-d14d-4520-91c1-38a4e272ca19","tier":"mega_hub","tier_reason":"1,000+ Pith inbound or 100,000+ external citations","pith_inbound_count":1375,"external_cited_by_count":null,"distinct_field_count":48,"first_pith_cited_at":"2017-10-16T18:05:45+00:00","last_pith_cited_at":"2026-06-26T11:30:42+00:00","author_build_status":"needed","summary_status":"needed","contexts_status":"needed","graph_status":"needed","ask_index_status":"needed","reader_status":"needed","recognition_status":"needed","updated_at":"2026-06-29T11:58:41.899749+00:00","tier_text":"mega_hub"},"tier":"mega_hub","role_counts":[{"context_role":"background","n":154},{"context_role":"method","n":113},{"context_role":"baseline","n":15},{"context_role":"dataset","n":4}],"polarity_counts":[{"context_polarity":"background","n":149},{"context_polarity":"use_method","n":109},{"context_polarity":"baseline","n":15},{"context_polarity":"unclear","n":7},{"context_polarity":"use_dataset","n":4},{"context_polarity":"support","n":2}],"runs":{"ask_index":{"job_type":"ask_index","status":"succeeded","result":{"title":"Proximal Policy Optimization Algorithms","claims":[{"claim_text":"We propose a new family of policy gradient methods for reinforcement learning, which alternate between sampling data through interaction with the environment, and optimizing a \"surrogate\" objective function using stochastic gradient ascent. Whereas standard policy gradient methods perform one gradient update per data sample, we propose a novel objective function that enables multiple epochs of minibatch updates. The new methods, which we call proximal policy optimization (PPO), have some of the benefits of trust region policy optimization (TRPO), but they are much simpler to implement, more ge","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks Proximal Policy Optimization Algorithms because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-13T17:53:29.701873+00:00"},"author_expand":{"job_type":"author_expand","status":"succeeded","result":{"authors_linked":[{"id":"298fbbc8-0497-4319-a5d8-d4bcef3f7f3d","orcid":null,"display_name":"John Schulman"},{"id":"31b0570b-235b-4790-add5-bb4bd6cd4ecb","orcid":null,"display_name":"Filip Wolski"},{"id":"be640ed3-1548-4b69-a2ac-76fa45adb08b","orcid":null,"display_name":"Prafulla Dhariwal"},{"id":"9783c850-24f9-4444-91ee-29b7660c744b","orcid":null,"display_name":"Alec Radford"},{"id":"c2a2eca0-5637-4573-bf0c-fc8cadee54bf","orcid":null,"display_name":"Oleg Klimov"}]},"error":null,"updated_at":"2026-05-13T17:24:04.648649+00:00"},"context_extract":{"job_type":"context_extract","status":"succeeded","result":{"enqueued_papers":25},"error":null,"updated_at":"2026-05-13T17:43:36.017501+00:00"},"graph_features":{"job_type":"graph_features","status":"succeeded","result":{"co_cited":[{"title":"DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models","work_id":"c5006563-f3ec-438a-9e35-b7b484f34828","shared_citers":255},{"title":"DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning","work_id":"e6b75ad5-2877-4168-97c8-710407094d20","shared_citers":137},{"title":"DAPO: An Open-Source LLM Reinforcement Learning System at Scale","work_id":"64019d00-0b11-4bbd-b173-b46c8fad0157","shared_citers":117},{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","shared_citers":116},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":75},{"title":"The Llama 3 Herd of Models","work_id":"1549a635-88af-4ac1-acfe-51ae7bb53345","shared_citers":65},{"title":"Training Verifiers to Solve Math Word Problems","work_id":"acab1aa8-b4d6-40e0-a3ee-25341701dca2","shared_citers":65},{"title":"Group Sequence Policy Optimization","work_id":"3a98b53b-9f52-4d95-adf7-89353c0a9a65","shared_citers":56},{"title":"OpenAI o1 System Card","work_id":"68d3c334-0fc9-49e3-b7b0-a69afae933e2","shared_citers":54},{"title":"High-Dimensional Continuous Control Using Generalized Advantage Estimation","work_id":"38e3ca94-96f0-4b19-a355-0754931af8be","shared_citers":52},{"title":"Understanding R1-Zero-Like Training: A Critical Perspective","work_id":"ec354f3b-9484-4a0c-94c8-92d4d0260835","shared_citers":51},{"title":"Qwen2.5 Technical Report","work_id":"d8432992-4980-4a81-85c7-9fa2c2b87f85","shared_citers":48},{"title":"Evaluating Large Language Models Trained on Code","work_id":"042493e9-b26f-4b4e-bbde-382072ca9b08","shared_citers":47},{"title":"Training a Helpful and Harmless Assistant with Reinforcement Learning from Human Feedback","work_id":"a1f2574b-a899-4713-be60-c87ba332656c","shared_citers":43},{"title":"Gemini 2.5: Pushing the Frontier with Advanced Reasoning, Multimodality, Long Context, and Next Generation Agentic Capabilities","work_id":"008df105-2fdd-45d8-857a-8e35868aecb6","shared_citers":39},{"title":"Adam: A Method for Stochastic Optimization","work_id":"1910796d-9b52-4683-bf5c-de9632c1028b","shared_citers":36},{"title":"Measuring Mathematical Problem Solving With the MATH Dataset","work_id":"50652ac6-fb7c-4675-a2c2-159c241feb17","shared_citers":33},{"title":"Fine-Tuning Language Models from Human Preferences","work_id":"4f54aad1-f3b6-404f-b9c7-e21ba0a33b99","shared_citers":32},{"title":"Kimi k1.5: Scaling Reinforcement Learning with LLMs","work_id":"bff96ab1-bd6a-4585-be23-74fdb51969c7","shared_citers":32},{"title":"Qwen2.5-VL Technical Report","work_id":"69dffacb-bfe8-442d-be86-48624c60426f","shared_citers":32},{"title":"GPT-4o System Card","work_id":"f37bf1c7-4964-4e56-9762-d20da8d9009f","shared_citers":31},{"title":"Qwen3-VL Technical Report","work_id":"1fe243aa-e3c0-4da6-b391-4cbcfc88d5c0","shared_citers":31},{"title":"HybridFlow: A Flexible and Efficient RLHF Framework","work_id":"7eb9c9f4-b322-4bba-8011-09ff8d6ad801","shared_citers":29},{"title":"LLaMA: Open and Efficient Foundation Language Models","work_id":"c018fc23-6f3f-4035-9d02-28a2173b2b9d","shared_citers":29}],"time_series":[{"n":1,"year":2017},{"n":1,"year":2018},{"n":4,"year":2019},{"n":1,"year":2020},{"n":2,"year":2021},{"n":1,"year":2022},{"n":9,"year":2023},{"n":6,"year":2024},{"n":18,"year":2025},{"n":591,"year":2026}]},"error":null,"updated_at":"2026-05-13T17:25:54.296292+00:00"},"identity_refresh":{"job_type":"identity_refresh","status":"succeeded","result":{"fixed":1,"items":[{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","resolver":"local_arxiv","confidence":0.98,"old_work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e"}],"errors":[],"attempted":1},"error":null,"updated_at":"2026-05-13T17:43:35.301807+00:00"},"reader_index":{"job_type":"reader_index","status":"succeeded","result":{"note":"annotated reader requires full-text/OA fetch; shell is wired for mega hubs","status":"reader queued"},"error":null,"updated_at":"2026-05-19T18:11:51.668023+00:00"},"recognition_alignment":{"job_type":"recognition_alignment","status":"succeeded","result":{"modules":["IndisputableMonolith.Gravity.PropagationSpeed","IndisputableMonolith.Foundation.PreTemporalForcingOrder","IndisputableMonolith.Physics.LightConeCausalityFromRS","IndisputableMonolith.Cosmology.EtaBPrefactorDerivation","IndisputableMonolith.Physics.MaxwellEquationsFromRS","IndisputableMonolith.Gravity.BlackHoleEntropyFromLedger","IndisputableMonolith.Thermodynamics.FermiDirac","IndisputableMonolith.Gravity.BlackHoleHorizonStates"],"query_chars":984},"error":null,"updated_at":"2026-05-19T18:11:51.666403+00:00"},"role_polarity":{"job_type":"role_polarity","status":"succeeded","result":{"title":"Proximal Policy Optimization Algorithms","claims":[{"claim_text":"We propose a new family of policy gradient methods for reinforcement learning, which alternate between sampling data through interaction with the environment, and optimizing a \"surrogate\" objective function using stochastic gradient ascent. Whereas standard policy gradient methods perform one gradient update per data sample, we propose a novel objective function that enables multiple epochs of minibatch updates. The new methods, which we call proximal policy optimization (PPO), have some of the benefits of trust region policy optimization (TRPO), but they are much simpler to implement, more ge","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks Proximal Policy Optimization Algorithms because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-13T17:53:29.698764+00:00"},"summary_claims":{"job_type":"summary_claims","status":"succeeded","result":{"title":"Proximal Policy Optimization Algorithms","claims":[{"claim_text":"We propose a new family of policy gradient methods for reinforcement learning, which alternate between sampling data through interaction with the environment, and optimizing a \"surrogate\" objective function using stochastic gradient ascent. Whereas standard policy gradient methods perform one gradient update per data sample, we propose a novel objective function that enables multiple epochs of minibatch updates. The new methods, which we call proximal policy optimization (PPO), have some of the benefits of trust region policy optimization (TRPO), but they are much simpler to implement, more ge","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks Proximal Policy Optimization Algorithms because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-13T17:25:52.708624+00:00"}},"summary":{"title":"Proximal Policy Optimization Algorithms","claims":[{"claim_text":"We propose a new family of policy gradient methods for reinforcement learning, which alternate between sampling data through interaction with the environment, and optimizing a \"surrogate\" objective function using stochastic gradient ascent. Whereas standard policy gradient methods perform one gradient update per data sample, we propose a novel objective function that enables multiple epochs of minibatch updates. The new methods, which we call proximal policy optimization (PPO), have some of the benefits of trust region policy optimization (TRPO), but they are much simpler to implement, more ge","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks Proximal Policy Optimization Algorithms because it crossed a citation-hub threshold.","role_counts":[]},"graph":{"co_cited":[{"title":"DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models","work_id":"c5006563-f3ec-438a-9e35-b7b484f34828","shared_citers":255},{"title":"DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning","work_id":"e6b75ad5-2877-4168-97c8-710407094d20","shared_citers":137},{"title":"DAPO: An Open-Source LLM Reinforcement Learning System at Scale","work_id":"64019d00-0b11-4bbd-b173-b46c8fad0157","shared_citers":117},{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","shared_citers":116},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":75},{"title":"The Llama 3 Herd of Models","work_id":"1549a635-88af-4ac1-acfe-51ae7bb53345","shared_citers":65},{"title":"Training Verifiers to Solve Math Word Problems","work_id":"acab1aa8-b4d6-40e0-a3ee-25341701dca2","shared_citers":65},{"title":"Group Sequence Policy Optimization","work_id":"3a98b53b-9f52-4d95-adf7-89353c0a9a65","shared_citers":56},{"title":"OpenAI o1 System Card","work_id":"68d3c334-0fc9-49e3-b7b0-a69afae933e2","shared_citers":54},{"title":"High-Dimensional Continuous Control Using Generalized Advantage Estimation","work_id":"38e3ca94-96f0-4b19-a355-0754931af8be","shared_citers":52},{"title":"Understanding R1-Zero-Like Training: A Critical Perspective","work_id":"ec354f3b-9484-4a0c-94c8-92d4d0260835","shared_citers":51},{"title":"Qwen2.5 Technical Report","work_id":"d8432992-4980-4a81-85c7-9fa2c2b87f85","shared_citers":48},{"title":"Evaluating Large Language Models Trained on Code","work_id":"042493e9-b26f-4b4e-bbde-382072ca9b08","shared_citers":47},{"title":"Training a Helpful and Harmless Assistant with Reinforcement Learning from Human Feedback","work_id":"a1f2574b-a899-4713-be60-c87ba332656c","shared_citers":43},{"title":"Gemini 2.5: Pushing the Frontier with Advanced Reasoning, Multimodality, Long Context, and Next Generation Agentic Capabilities","work_id":"008df105-2fdd-45d8-857a-8e35868aecb6","shared_citers":39},{"title":"Adam: A Method for Stochastic Optimization","work_id":"1910796d-9b52-4683-bf5c-de9632c1028b","shared_citers":36},{"title":"Measuring Mathematical Problem Solving With the MATH Dataset","work_id":"50652ac6-fb7c-4675-a2c2-159c241feb17","shared_citers":33},{"title":"Fine-Tuning Language Models from Human Preferences","work_id":"4f54aad1-f3b6-404f-b9c7-e21ba0a33b99","shared_citers":32},{"title":"Kimi k1.5: Scaling Reinforcement Learning with LLMs","work_id":"bff96ab1-bd6a-4585-be23-74fdb51969c7","shared_citers":32},{"title":"Qwen2.5-VL Technical Report","work_id":"69dffacb-bfe8-442d-be86-48624c60426f","shared_citers":32},{"title":"GPT-4o System Card","work_id":"f37bf1c7-4964-4e56-9762-d20da8d9009f","shared_citers":31},{"title":"Qwen3-VL Technical Report","work_id":"1fe243aa-e3c0-4da6-b391-4cbcfc88d5c0","shared_citers":31},{"title":"HybridFlow: A Flexible and Efficient RLHF Framework","work_id":"7eb9c9f4-b322-4bba-8011-09ff8d6ad801","shared_citers":29},{"title":"LLaMA: Open and Efficient Foundation Language Models","work_id":"c018fc23-6f3f-4035-9d02-28a2173b2b9d","shared_citers":29}],"time_series":[{"n":1,"year":2017},{"n":1,"year":2018},{"n":4,"year":2019},{"n":1,"year":2020},{"n":2,"year":2021},{"n":1,"year":2022},{"n":9,"year":2023},{"n":6,"year":2024},{"n":18,"year":2025},{"n":591,"year":2026}]},"authors":[{"id":"9783c850-24f9-4444-91ee-29b7660c744b","orcid":null,"display_name":"Alec Radford","source":"manual","import_confidence":0.72},{"id":"31b0570b-235b-4790-add5-bb4bd6cd4ecb","orcid":null,"display_name":"Filip Wolski","source":"manual","import_confidence":0.72},{"id":"298fbbc8-0497-4319-a5d8-d4bcef3f7f3d","orcid":null,"display_name":"John Schulman","source":"manual","import_confidence":0.72},{"id":"c2a2eca0-5637-4573-bf0c-fc8cadee54bf","orcid":null,"display_name":"Oleg Klimov","source":"manual","import_confidence":0.72},{"id":"be640ed3-1548-4b69-a2ac-76fa45adb08b","orcid":null,"display_name":"Prafulla Dhariwal","source":"manual","import_confidence":0.72}]}}