{"work":{"id":"240c67fe-d14d-4520-91c1-38a4e272ca19","openalex_id":null,"doi":null,"arxiv_id":"1707.06347","raw_key":null,"title":"Proximal Policy Optimization Algorithms","authors":null,"authors_text":"John Schulman, Filip Wolski, Prafulla Dhariwal, Alec Radford, Oleg Klimov","year":2017,"venue":"cs.LG","abstract":"We propose a new family of policy gradient methods for reinforcement learning, which alternate between sampling data through interaction with the environment, and optimizing a \"surrogate\" objective function using stochastic gradient ascent. Whereas standard policy gradient methods perform one gradient update per data sample, we propose a novel objective function that enables multiple epochs of minibatch updates. The new methods, which we call proximal policy optimization (PPO), have some of the benefits of trust region policy optimization (TRPO), but they are much simpler to implement, more general, and have better sample complexity (empirically). Our experiments test PPO on a collection of benchmark tasks, including simulated robotic locomotion and Atari game playing, and we show that PPO outperforms other online policy gradient methods, and overall strikes a favorable balance between sample complexity, simplicity, and wall-time.","external_url":"https://arxiv.org/abs/1707.06347","cited_by_count":null,"metadata_source":"pith","metadata_fetched_at":"2026-06-29T09:53:17.533166+00:00","pith_arxiv_id":"1707.06347","created_at":"2026-05-08T18:44:01.332768+00:00","updated_at":"2026-06-29T09:53:17.533166+00:00","title_quality_ok":true,"display_title":"Proximal Policy Optimization Algorithms","render_title":"Proximal Policy Optimization Algorithms"},"hub":{"state":{"work_id":"240c67fe-d14d-4520-91c1-38a4e272ca19","tier":"mega_hub","tier_reason":"1,000+ Pith inbound or 100,000+ external citations","pith_inbound_count":1373,"external_cited_by_count":null,"distinct_field_count":48,"first_pith_cited_at":"2017-10-16T18:05:45+00:00","last_pith_cited_at":"2026-06-26T11:30:42+00:00","author_build_status":"needed","summary_status":"needed","contexts_status":"needed","graph_status":"needed","ask_index_status":"needed","reader_status":"needed","recognition_status":"needed","updated_at":"2026-06-29T10:48:36.648871+00:00","tier_text":"mega_hub"},"tier":"mega_hub","role_counts":[{"context_role":"background","n":154},{"context_role":"method","n":113},{"context_role":"baseline","n":15},{"context_role":"dataset","n":4}],"polarity_counts":[{"context_polarity":"background","n":149},{"context_polarity":"use_method","n":109},{"context_polarity":"baseline","n":15},{"context_polarity":"unclear","n":7},{"context_polarity":"use_dataset","n":4},{"context_polarity":"support","n":2}],"runs":{"ask_index":{"job_type":"ask_index","status":"succeeded","result":{"title":"Proximal Policy Optimization Algorithms","claims":[{"claim_text":"We propose a new family of policy gradient methods for reinforcement learning, which alternate between sampling data through interaction with the environment, and optimizing a \"surrogate\" objective function using stochastic gradient ascent. Whereas standard policy gradient methods perform one gradient update per data sample, we propose a novel objective function that enables multiple epochs of minibatch updates. The new methods, which we call proximal policy optimization (PPO), have some of the benefits of trust region policy optimization (TRPO), but they are much simpler to implement, more ge","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks Proximal Policy Optimization Algorithms because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-13T17:53:29.701873+00:00"},"author_expand":{"job_type":"author_expand","status":"succeeded","result":{"authors_linked":[{"id":"298fbbc8-0497-4319-a5d8-d4bcef3f7f3d","orcid":null,"display_name":"John Schulman"},{"id":"31b0570b-235b-4790-add5-bb4bd6cd4ecb","orcid":null,"display_name":"Filip Wolski"},{"id":"be640ed3-1548-4b69-a2ac-76fa45adb08b","orcid":null,"display_name":"Prafulla Dhariwal"},{"id":"9783c850-24f9-4444-91ee-29b7660c744b","orcid":null,"display_name":"Alec Radford"},{"id":"c2a2eca0-5637-4573-bf0c-fc8cadee54bf","orcid":null,"display_name":"Oleg Klimov"}]},"error":null,"updated_at":"2026-05-13T17:24:04.648649+00:00"},"context_extract":{"job_type":"context_extract","status":"succeeded","result":{"enqueued_papers":25},"error":null,"updated_at":"2026-05-13T17:43:36.017501+00:00"},"graph_features":{"job_type":"graph_features","status":"succeeded","result":{"co_cited":[{"title":"DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models","work_id":"c5006563-f3ec-438a-9e35-b7b484f34828","shared_citers":255},{"title":"DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning","work_id":"e6b75ad5-2877-4168-97c8-710407094d20","shared_citers":137},{"title":"DAPO: An Open-Source LLM Reinforcement Learning System at Scale","work_id":"64019d00-0b11-4bbd-b173-b46c8fad0157","shared_citers":117},{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","shared_citers":116},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":75},{"title":"The Llama 3 Herd of Models","work_id":"1549a635-88af-4ac1-acfe-51ae7bb53345","shared_citers":65},{"title":"Training Verifiers to Solve Math Word Problems","work_id":"acab1aa8-b4d6-40e0-a3ee-25341701dca2","shared_citers":65},{"title":"Group Sequence Policy Optimization","work_id":"3a98b53b-9f52-4d95-adf7-89353c0a9a65","shared_citers":56},{"title":"OpenAI o1 System Card","work_id":"68d3c334-0fc9-49e3-b7b0-a69afae933e2","shared_citers":54},{"title":"High-Dimensional Continuous Control Using Generalized Advantage Estimation","work_id":"38e3ca94-96f0-4b19-a355-0754931af8be","shared_citers":52},{"title":"Understanding R1-Zero-Like Training: A Critical Perspective","work_id":"ec354f3b-9484-4a0c-94c8-92d4d0260835","shared_citers":51},{"title":"Qwen2.5 Technical Report","work_id":"d8432992-4980-4a81-85c7-9fa2c2b87f85","shared_citers":48},{"title":"Evaluating Large Language Models Trained on Code","work_id":"042493e9-b26f-4b4e-bbde-382072ca9b08","shared_citers":47},{"title":"Training a Helpful and Harmless Assistant with Reinforcement Learning from Human Feedback","work_id":"a1f2574b-a899-4713-be60-c87ba332656c","shared_citers":43},{"title":"Gemini 2.5: Pushing the Frontier with Advanced Reasoning, Multimodality, Long Context, and Next Generation Agentic Capabilities","work_id":"008df105-2fdd-45d8-857a-8e35868aecb6","shared_citers":39},{"title":"Adam: A Method for Stochastic Optimization","work_id":"1910796d-9b52-4683-bf5c-de9632c1028b","shared_citers":36},{"title":"Measuring Mathematical Problem Solving With the MATH Dataset","work_id":"50652ac6-fb7c-4675-a2c2-159c241feb17","shared_citers":33},{"title":"Fine-Tuning Language Models from Human Preferences","work_id":"4f54aad1-f3b6-404f-b9c7-e21ba0a33b99","shared_citers":32},{"title":"Kimi k1.5: Scaling Reinforcement Learning with LLMs","work_id":"bff96ab1-bd6a-4585-be23-74fdb51969c7","shared_citers":32},{"title":"Qwen2.5-VL Technical Report","work_id":"69dffacb-bfe8-442d-be86-48624c60426f","shared_citers":32},{"title":"GPT-4o System Card","work_id":"f37bf1c7-4964-4e56-9762-d20da8d9009f","shared_citers":31},{"title":"Qwen3-VL Technical Report","work_id":"1fe243aa-e3c0-4da6-b391-4cbcfc88d5c0","shared_citers":31},{"title":"HybridFlow: A Flexible and Efficient RLHF Framework","work_id":"7eb9c9f4-b322-4bba-8011-09ff8d6ad801","shared_citers":29},{"title":"LLaMA: Open and Efficient Foundation Language Models","work_id":"c018fc23-6f3f-4035-9d02-28a2173b2b9d","shared_citers":29}],"time_series":[{"n":1,"year":2017},{"n":1,"year":2018},{"n":4,"year":2019},{"n":1,"year":2020},{"n":2,"year":2021},{"n":1,"year":2022},{"n":9,"year":2023},{"n":6,"year":2024},{"n":18,"year":2025},{"n":591,"year":2026}]},"error":null,"updated_at":"2026-05-13T17:25:54.296292+00:00"},"identity_refresh":{"job_type":"identity_refresh","status":"succeeded","result":{"fixed":1,"items":[{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","resolver":"local_arxiv","confidence":0.98,"old_work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e"}],"errors":[],"attempted":1},"error":null,"updated_at":"2026-05-13T17:43:35.301807+00:00"},"reader_index":{"job_type":"reader_index","status":"succeeded","result":{"note":"annotated reader requires full-text/OA fetch; shell is wired for mega hubs","status":"reader queued"},"error":null,"updated_at":"2026-05-19T18:11:51.668023+00:00"},"recognition_alignment":{"job_type":"recognition_alignment","status":"succeeded","result":{"modules":["IndisputableMonolith.Gravity.PropagationSpeed","IndisputableMonolith.Foundation.PreTemporalForcingOrder","IndisputableMonolith.Physics.LightConeCausalityFromRS","IndisputableMonolith.Cosmology.EtaBPrefactorDerivation","IndisputableMonolith.Physics.MaxwellEquationsFromRS","IndisputableMonolith.Gravity.BlackHoleEntropyFromLedger","IndisputableMonolith.Thermodynamics.FermiDirac","IndisputableMonolith.Gravity.BlackHoleHorizonStates"],"query_chars":984},"error":null,"updated_at":"2026-05-19T18:11:51.666403+00:00"},"role_polarity":{"job_type":"role_polarity","status":"succeeded","result":{"title":"Proximal Policy Optimization Algorithms","claims":[{"claim_text":"We propose a new family of policy gradient methods for reinforcement learning, which alternate between sampling data through interaction with the environment, and optimizing a \"surrogate\" objective function using stochastic gradient ascent. Whereas standard policy gradient methods perform one gradient update per data sample, we propose a novel objective function that enables multiple epochs of minibatch updates. The new methods, which we call proximal policy optimization (PPO), have some of the benefits of trust region policy optimization (TRPO), but they are much simpler to implement, more ge","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks Proximal Policy Optimization Algorithms because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-13T17:53:29.698764+00:00"},"summary_claims":{"job_type":"summary_claims","status":"succeeded","result":{"title":"Proximal Policy Optimization Algorithms","claims":[{"claim_text":"We propose a new family of policy gradient methods for reinforcement learning, which alternate between sampling data through interaction with the environment, and optimizing a \"surrogate\" objective function using stochastic gradient ascent. Whereas standard policy gradient methods perform one gradient update per data sample, we propose a novel objective function that enables multiple epochs of minibatch updates. The new methods, which we call proximal policy optimization (PPO), have some of the benefits of trust region policy optimization (TRPO), but they are much simpler to implement, more ge","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks Proximal Policy Optimization Algorithms because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-13T17:25:52.708624+00:00"}},"summary":{"title":"Proximal Policy Optimization Algorithms","claims":[{"claim_text":"We propose a new family of policy gradient methods for reinforcement learning, which alternate between sampling data through interaction with the environment, and optimizing a \"surrogate\" objective function using stochastic gradient ascent. Whereas standard policy gradient methods perform one gradient update per data sample, we propose a novel objective function that enables multiple epochs of minibatch updates. The new methods, which we call proximal policy optimization (PPO), have some of the benefits of trust region policy optimization (TRPO), but they are much simpler to implement, more ge","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks Proximal Policy Optimization Algorithms because it crossed a citation-hub threshold.","role_counts":[]},"graph":{"co_cited":[{"title":"DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models","work_id":"c5006563-f3ec-438a-9e35-b7b484f34828","shared_citers":255},{"title":"DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning","work_id":"e6b75ad5-2877-4168-97c8-710407094d20","shared_citers":137},{"title":"DAPO: An Open-Source LLM Reinforcement Learning System at Scale","work_id":"64019d00-0b11-4bbd-b173-b46c8fad0157","shared_citers":117},{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","shared_citers":116},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":75},{"title":"The Llama 3 Herd of Models","work_id":"1549a635-88af-4ac1-acfe-51ae7bb53345","shared_citers":65},{"title":"Training Verifiers to Solve Math Word Problems","work_id":"acab1aa8-b4d6-40e0-a3ee-25341701dca2","shared_citers":65},{"title":"Group Sequence Policy Optimization","work_id":"3a98b53b-9f52-4d95-adf7-89353c0a9a65","shared_citers":56},{"title":"OpenAI o1 System Card","work_id":"68d3c334-0fc9-49e3-b7b0-a69afae933e2","shared_citers":54},{"title":"High-Dimensional Continuous Control Using Generalized Advantage Estimation","work_id":"38e3ca94-96f0-4b19-a355-0754931af8be","shared_citers":52},{"title":"Understanding R1-Zero-Like Training: A Critical Perspective","work_id":"ec354f3b-9484-4a0c-94c8-92d4d0260835","shared_citers":51},{"title":"Qwen2.5 Technical Report","work_id":"d8432992-4980-4a81-85c7-9fa2c2b87f85","shared_citers":48},{"title":"Evaluating Large Language Models Trained on Code","work_id":"042493e9-b26f-4b4e-bbde-382072ca9b08","shared_citers":47},{"title":"Training a Helpful and Harmless Assistant with Reinforcement Learning from Human Feedback","work_id":"a1f2574b-a899-4713-be60-c87ba332656c","shared_citers":43},{"title":"Gemini 2.5: Pushing the Frontier with Advanced Reasoning, Multimodality, Long Context, and Next Generation Agentic Capabilities","work_id":"008df105-2fdd-45d8-857a-8e35868aecb6","shared_citers":39},{"title":"Adam: A Method for Stochastic Optimization","work_id":"1910796d-9b52-4683-bf5c-de9632c1028b","shared_citers":36},{"title":"Measuring Mathematical Problem Solving With the MATH Dataset","work_id":"50652ac6-fb7c-4675-a2c2-159c241feb17","shared_citers":33},{"title":"Fine-Tuning Language Models from Human Preferences","work_id":"4f54aad1-f3b6-404f-b9c7-e21ba0a33b99","shared_citers":32},{"title":"Kimi k1.5: Scaling Reinforcement Learning with LLMs","work_id":"bff96ab1-bd6a-4585-be23-74fdb51969c7","shared_citers":32},{"title":"Qwen2.5-VL Technical Report","work_id":"69dffacb-bfe8-442d-be86-48624c60426f","shared_citers":32},{"title":"GPT-4o System Card","work_id":"f37bf1c7-4964-4e56-9762-d20da8d9009f","shared_citers":31},{"title":"Qwen3-VL Technical Report","work_id":"1fe243aa-e3c0-4da6-b391-4cbcfc88d5c0","shared_citers":31},{"title":"HybridFlow: A Flexible and Efficient RLHF Framework","work_id":"7eb9c9f4-b322-4bba-8011-09ff8d6ad801","shared_citers":29},{"title":"LLaMA: Open and Efficient Foundation Language Models","work_id":"c018fc23-6f3f-4035-9d02-28a2173b2b9d","shared_citers":29}],"time_series":[{"n":1,"year":2017},{"n":1,"year":2018},{"n":4,"year":2019},{"n":1,"year":2020},{"n":2,"year":2021},{"n":1,"year":2022},{"n":9,"year":2023},{"n":6,"year":2024},{"n":18,"year":2025},{"n":591,"year":2026}]},"authors":[{"id":"9783c850-24f9-4444-91ee-29b7660c744b","orcid":null,"display_name":"Alec Radford","source":"manual","import_confidence":0.72},{"id":"31b0570b-235b-4790-add5-bb4bd6cd4ecb","orcid":null,"display_name":"Filip Wolski","source":"manual","import_confidence":0.72},{"id":"298fbbc8-0497-4319-a5d8-d4bcef3f7f3d","orcid":null,"display_name":"John Schulman","source":"manual","import_confidence":0.72},{"id":"c2a2eca0-5637-4573-bf0c-fc8cadee54bf","orcid":null,"display_name":"Oleg Klimov","source":"manual","import_confidence":0.72},{"id":"be640ed3-1548-4b69-a2ac-76fa45adb08b","orcid":null,"display_name":"Prafulla Dhariwal","source":"manual","import_confidence":0.72}]},"citers":{"total":1373,"items":[{"citing_arxiv_id":"2606.27981","ref_index":229,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ToxiREX: A Dataset on Toxic REasoning in ConteXt","primary_cat":"cs.CL","submitted_at":"2026-06-26T11:30:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ToxiREX is a new dataset of 128k Reddit comments in six languages with hierarchical annotations for implicit toxicity in conversational context based on an existing reasoning schema.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.25978","ref_index":19,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Multi-Agent Goal Recognition with Team- and Goal-Conditioned Reinforcement Learning and Factorized Branch-and-Bound","primary_cat":"cs.MA","submitted_at":"2026-06-24T15:50:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MAGR-BB matches exhaustive search accuracy on multi-agent Blocksworld while reducing hypothesis evaluations by orders of magnitude via RL scoring inside factorized branch-and-bound.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.23257","ref_index":80,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Dynamic multi-agent deep reinforcement learning-based pricing and incentivization approach in multimodal transportation networks","primary_cat":"cs.LG","submitted_at":"2026-06-22T12:39:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Multi-agent DRL framework shows dynamic incentives and pricing can cut commuter costs ~20%, emissions ~10%, and double public transport profit in simulated morning peak scenarios.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.21387","ref_index":28,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Long-Distance Real-World Navigation of the Legged-Wheeled Robot Go2-W Using Deep Reinforcement Learning","primary_cat":"cs.RO","submitted_at":"2026-06-19T12:53:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"A DRL locomotion controller extended from prior quadruped work enabled the Go2-W robot to complete 2.8 km of autonomous real-world navigation including mixed terrain and stairs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.18625","ref_index":28,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SRL: Combining SLIP Model and Reinforcement Learning for Agile Robotic Jumping","primary_cat":"cs.RO","submitted_at":"2026-06-17T02:41:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"SRL combines SLIP feedforward with RL feedback to produce stable bipedal and quadrupedal jumps with lower training cost than pure RL.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.11525","ref_index":61,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Learning Object Manipulation from Scratch via Contrastive Interaction","primary_cat":"cs.RO","submitted_at":"2026-06-10T00:06:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"IWR improves CRL sample efficiency and performance in interaction-rich manipulation by interaction-aware resampling that preserves mode boundaries, yielding 19.8% average gains and a real-world air-hockey agent.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.11167","ref_index":35,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Multi-Faceted Interactivity Alignment in Full-Duplex Speech Models","primary_cat":"cs.CL","submitted_at":"2026-06-09T17:46:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A multi-axis RL alignment technique improves pause handling, turn-taking, backchanneling, and interruption response in full-duplex spoken dialogue models by optimizing axis-specific rewards derived from human audio segments.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.20658","ref_index":266,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Expected Free Energy-based Planning as Variational Inference","primary_cat":"cs.AI","submitted_at":"2026-06-09T08:09:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"EFE-based planning is formulated as variational free energy minimization with epistemic priors, decomposing into expected plan costs plus a complexity term.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.09439","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Tracking the Effective Surface Area of Non-Convex Satellites","primary_cat":"eess.SY","submitted_at":"2026-06-08T12:48:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"Backstepping control tracks effective surface area of non-convex satellites for drag-based orbital control, with asymptotic stability proofs and an extension for solar panel exposure.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.08816","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Knowledge Graphs and Reasoning LLMs for Finding Simple Yet Effective Transcriptomic Perturbation Predictors","primary_cat":"cs.LG","submitted_at":"2026-06-07T20:09:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"K-nearest neighbor from a knowledge graph beats most methods on out-of-distribution transcriptomic perturbation prediction, and an RL-trained reasoning LLM matches SOTA on Replogle et al. (2022) cell lines while improving downstream differential expression prediction.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.08729","ref_index":44,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"IR-SIM: A Lightweight Skill-Native Simulator for Navigation, Learning, and Benchmarking","primary_cat":"cs.RO","submitted_at":"2026-06-07T16:55:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"IR-SIM is a YAML-defined simulator for mobile robot navigation that supports text-prompt scenario creation, policy training, benchmarking, and bridging to higher-fidelity or real-world settings.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.07513","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Agentopia: Long-Term Life Simulation and Learning in Agent Societies","primary_cat":"cs.CL","submitted_at":"2026-06-05T17:59:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Agentopia runs decade-scale multi-agent LLM simulations to study emergent social behaviors and trains models with life-reward rejection sampling, yielding +15.6% gains on role-playing benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.05882","ref_index":37,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Market Informedness and Market-Maker Profitability: The Trade-Off Between Adverse Selection and Price Discovery","primary_cat":"q-fin.TR","submitted_at":"2026-06-04T08:53:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Agent-based model with multi-agent RL shows market-maker profitability trends upward overall with rising aggregate market informedness as price-discovery benefits offset adverse-selection costs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.05800","ref_index":36,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SALT: When More Rollouts Don't Help in Group-Based Policy Optimization and How to Make Them Matter","primary_cat":"cs.LG","submitted_at":"2026-06-04T07:29:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SALT is a subspace-adaptive plug-in for GRPO that decomposes group-relative coefficients into shared and residual channels using mini-batch Gram geometry and amplifies residuals to mitigate signed cancellation in RLVR.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.05722","ref_index":27,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AISC deployment in dynamic UAV-assisted MEC network: a reinforcement learning method based on heterogeneous graph attention neural network","primary_cat":"cs.NI","submitted_at":"2026-06-04T05:26:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"A heterogeneous graph attention Q-network is introduced for AISC deployment that reduces completion time while improving load balance and energy use in dynamic UMEC networks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.04935","ref_index":290,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"What Type of Inference is Active Inference?","primary_cat":"cs.AI","submitted_at":"2026-06-03T14:24:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"EFE-based active inference planning is characterized as VFE on an augmented model plus entropy and planning corrections, with a derived message-passing implementation and grid-world validation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.04735","ref_index":34,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Trace-Mediated Peak Bias: Bridging Temporal Credit Assignment and Cognitive Heuristics in Deep Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-06-03T11:19:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Eligibility traces in deep RL create a peak bias by amplifying distal TD errors into gradient shocks that fixed-step SGD cannot normalize, leading to overestimation of peak-reward trajectories and a mechanistic account of the peak-end rule.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.04574","ref_index":38,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Dynamic Multi-Pair Trading Strategy in Cryptocurrency Markets with Deep Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-06-03T08:10:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"A hybrid DRL system for multi-pair crypto trading with deterministic risk shielding outperforms a heuristic baseline at 10% significance on Binance futures data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.04471","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Self-Optimizing Control of Continuous Processes Based on Reinforcement Learning","primary_cat":"eess.SY","submitted_at":"2026-06-03T05:33:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Reinforcement learning optimizes controlled variable selection for self-optimizing control by embedding the structure in an actor network and using economic rewards, showing better dynamic performance than a steady-state baseline in a CSTR simulation under disturbances.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01565","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Hierarchical Semantic-Augmented Navigation: Optimal Transport and Graph-Driven Reasoning for Vision-Language Navigation","primary_cat":"cs.RO","submitted_at":"2026-06-01T02:11:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"HSAN integrates hierarchical semantic graphs, optimal transport-based goal selection, and graph-aware RL to claim SOTA results on VLN-CE tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01332","ref_index":27,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"S2M-Trek: From Single to Multi-Sphere Transport via Per-Frame Deep Sets on a Wheel-Legged Robot","primary_cat":"cs.RO","submitted_at":"2026-05-31T16:35:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Per-Frame Deep Sets enables scaling single-sphere to five-sphere transport on a quadruped by performing permutation-invariant pooling within each history frame, reaching 100% no-drop success in simulation where standard encoders plateau.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01028","ref_index":23,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MedGym:A Unified Continuous-Time Benchmark for Dynamic Medical Treatment Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-05-31T05:36:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MedGym introduces a continuous-time RL benchmark for medical treatment derived from clinical data via PINNs, supporting offline/online evaluation on personalization, safety, and discrete vs continuous methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00950","ref_index":47,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"COLLIE: Guiding Skill Discovery in Semantically Coherent Latent Space","primary_cat":"cs.LG","submitted_at":"2026-05-31T02:04:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"COLLIE constructs a semantically coherent skill latent space from unsupervised data to enable training-free guidance with sparse online feedback in guided skill discovery.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.02636","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Too Much of a Good Thing: When sim2real Efforts Impede Policy Learning (And What to Do About It)","primary_cat":"cs.RO","submitted_at":"2026-05-30T22:17:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Excessive sim2real focus impedes robotics policy learning via simulator lock-in; a kinematics-only sim2sim2real paradigm is proposed to restore exploration.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00880","ref_index":27,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Task diversity produces systematic transfer but inhibits continual reinforcement learning","primary_cat":"cs.LG","submitted_at":"2026-05-30T20:31:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Task diversity along map, object, and hierarchy axes produces local transfer across shifts in a new continual RL benchmark but fails to sustain learning as the number of shifts grows.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00840","ref_index":20,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Certificate-Guided Evaluation of Reinforcement Learning Generalization","primary_cat":"cs.AI","submitted_at":"2026-05-30T18:31:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A logic-driven framework defines inductive reach-avoid tasks and uses neural certificates to certify RL generalization, with empirical results linking fewer violations to more solved test tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00702","ref_index":30,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Shape Your Body: Value Gradients for Multi-Embodiment Robot Design","primary_cat":"cs.RO","submitted_at":"2026-05-30T12:21:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Trains embodiment-aware value functions on up to 50 robots and applies their gradients as differentiable surrogates to optimize held-out robot designs with over 1100 parameters.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00674","ref_index":46,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"The Paradox of Outcome Optimization: A Causal Information-Theoretic Bound on Reasoning Shortcuts in LLMs","primary_cat":"cs.LG","submitted_at":"2026-05-30T11:06:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Outcome optimization induces reward-induced manifold collapse in LLMs by favoring low-complexity spurious correlations over high-complexity causal reasoning, with process reward models acting as topological filters to block shortcuts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00651","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MESA: Improving MoE Safety Alignment via Decentralized Expertise","primary_cat":"cs.LG","submitted_at":"2026-05-30T09:54:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MESA decentralizes safety duties in MoE LLMs via expert capacity reallocation and dynamic routing refinement based on optimal transport theory, yielding robust defense on harmful benchmarks while preserving helpfulness.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00637","ref_index":25,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Global-Local Attention Decomposition for Terrain Encoding in Humanoid Perceptive Locomotion","primary_cat":"cs.RO","submitted_at":"2026-05-30T09:23:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"GLAD decomposes terrain encoding via coarse-to-fine attention on elevation maps to separate broad awareness from precise foothold selection in perceptive humanoid locomotion.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00595","ref_index":74,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Elastohydrodynamic coupling enhances flow generation by coordinated ciliary beating","primary_cat":"physics.bio-ph","submitted_at":"2026-05-30T07:52:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Reinforcement learning on a bead-spring cilia model identifies antiplectic coordination as flow-maximizing, with a tilted-slider reduced model showing that a time-averaged position shift opposite the effective stroke enhances transport via elastic restoring force coupling, and that symplectic coordi","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00593","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SPADER: Step-wise Peer Advantage with Diversity-Aware Exploration Rewards for Multi-Answer Question Answering","primary_cat":"cs.CL","submitted_at":"2026-05-30T07:47:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SPADER proposes step-wise peer advantage and diversity-aware exploration rewards in RL for multi-answer QA, reporting improved recall and F1 on QAMPARI, Mintaka, WebQSP, and QUEST.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00583","ref_index":26,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Improving Visual Representation Alignment Generation with GRPO","primary_cat":"cs.CV","submitted_at":"2026-05-30T07:21:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"VRPO applies generative representation policy optimization to dynamically align diffusion features with pretrained visual encoders, claiming +1.8 FID gains and 2.3x faster training versus REPA.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00440","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SDR: Set-Distance Rewards for Radiology Report Generation","primary_cat":"cs.AI","submitted_at":"2026-05-30T00:10:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Set-to-set distances on sentence embeddings provide a permutation-invariant reward signal that improves GRPO training and enables efficient test-time scaling for vision-language models generating chest X-ray reports.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00400","ref_index":39,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Dynamic Proxy-Mixing: Transferring Replay Controllers from Small to Large Models for Continual Instruction Tuning","primary_cat":"cs.LG","submitted_at":"2026-05-29T22:32:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PROXYMIX learns a dynamic replay controller on a small proxy model and transfers it to a large target model, improving accuracy by 3.4 points and reducing forgetting by 3.5 points on LLaMA-3-8B continual tuning sequences.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00270","ref_index":49,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Robust Shielding for Safe Reinforcement Learning","primary_cat":"cs.AI","submitted_at":"2026-05-29T19:01:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A sound and optimal shielding method for robust MDPs ensures LTL safety under worst-case transitions and combines with PAC sampling to produce minimally restrictive shields for learned models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31494","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Consolidating Rewarded Perturbations for LLM Post-Training","primary_cat":"cs.CL","submitted_at":"2026-05-29T16:16:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CoRP consolidates reward-weighted perturbations into a single model via low-rank structure, improving base LLMs by 8.1 points on average while using one-tenth the budget of prior ensembles and one forward pass.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31455","ref_index":20,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DRIFT: Decoupled Rollouts and Importance-Weighted Fine-Tuning for Efficient Multi-Turn Optimization","primary_cat":"cs.LG","submitted_at":"2026-05-29T15:49:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DRIFT achieves multi-turn RL performance via offline importance-weighted SFT by leveraging the equivalence of KL-regularized RL to weighted supervised learning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31312","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Learning from Fine-Grained Visual Discrepancies: Mitigating Multimodal Hallucinations via In-Context Visual Contrastive Optimization","primary_cat":"cs.CV","submitted_at":"2026-05-29T13:44:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"IC-VCO places contrastive images in one context for a consistent DPO-style objective, adds Visual Contrast Distillation, and uses semantic perturbation for hard negatives, reporting best results on five benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31261","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Why Linear Recurrent Memory Works in Partially Observable Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-05-29T12:56:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Linear recurrent filters exactly reproduce HMM belief logits under deterministic transitions and achieve near-zero decoding error under nearly deterministic ones, extending to action-controlled cases.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.07602","ref_index":38,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Sample-Efficient Post-Training for LEGO Spatial-Physics Reasoning","primary_cat":"cs.LG","submitted_at":"2026-05-29T09:31:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"PVPO is a sample-efficient RL method that improves semantic, geometric, and physical quality in LLM LEGO assembly generation by mitigating the PhysHack failure mode where validity alone fails to ensure fidelity.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31023","ref_index":23,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"HADT: A Heterogeneous Multi-Agent Differential Transformer for Autonomous Earth Observation Satellite Cluster","primary_cat":"cs.AI","submitted_at":"2026-05-29T08:54:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Proposes HADT, a heterogeneous multi-agent differential transformer with relational observations-actions tokenization for model-free RL-based autonomous resource management in EO satellite clusters, claiming gains over baselines and adaptability to cluster size changes.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30957","ref_index":18,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"RDGen: Demonstration Generation for High-Quality Robot Learning via Reinforcement Learning","primary_cat":"cs.RO","submitted_at":"2026-05-29T07:53:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"RDGen uses sim-to-real RL policies to generate smoother robot demonstrations that improve downstream VLA performance over human-collected data on pick-and-place tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30919","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"De-attribute to Forget for LLM Unlearning","primary_cat":"cs.LG","submitted_at":"2026-05-29T07:03:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DareU reframes LLM unlearning as zeroing data attribution via RL rewards from an LLM classifier approximation, claiming better balance of forget quality and model utility than loss-based baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30916","ref_index":58,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Welfare, Improvability, and Variance: A Principal-Agent Approach to Optimal Benchmark Item Aggregation","primary_cat":"cs.LG","submitted_at":"2026-05-29T07:01:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Models benchmarking as principal-agent game, derives welfare loss from welfare alignment, improvability and variance, and applies an audit framework to OLMES items.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30914","ref_index":100,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Automating Formal Verification with Reinforcement Learning and Recursive Inference","primary_cat":"cs.LG","submitted_at":"2026-05-29T06:59:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"RLVR training raises verified Dafny pass rates from 9.7% to 31.1% on a filtered benchmark while a Lean proof scaffold lifts success from 46.2% to 69.2% on a pilot set and solves 7 of 42 prior unsolved tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30896","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Zero Collapse: A Failure Mode of Policy Gradient Methods in Discontinuous Reward Environments","primary_cat":"cs.LG","submitted_at":"2026-05-29T06:29:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Policy gradient methods suffer from zero collapse in discontinuous reward environments such as first-price auctions, where exploration causes policies to enter flat zero-reward regions from which recovery is sample-inefficient due to absent gradient signals.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30873","ref_index":28,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Federated Variational Preference Alignment with Gumbel-Softmax Prior for Personalized User Preferences","primary_cat":"cs.LG","submitted_at":"2026-05-29T05:52:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"FedVPA-GP applies variational preference learning in a federated setting with a mixture prior and orthogonal loss to disentangle user preferences on the HH-RLHF dataset.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30859","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DARTS: Distribution-Aware Active Rollout Trajectory Shaping for Accelerating LLM Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-05-29T05:31:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"DARTS accelerates LLM RL training up to 1.77x by distribution-aware trajectory sampling and adaptive redundancy allocation that shapes rollouts toward conciseness without performance loss.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30795","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Feat2Go: Visual Feature-Grounded Value Estimation for Embodied Reinforcement Learning","primary_cat":"cs.RO","submitted_at":"2026-05-29T03:36:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Feat2Go uses patch-level similarity from a visual world model and trend-based clustering to create progress targets for training value models that improve reward shaping in embodied RL for VLA policies, yielding large gains on ManiSkill3 and RoboTwin benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30789","ref_index":24,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Smaller Models are Natural Explorers for Policy-Level Diversity in GRPO","primary_cat":"cs.LG","submitted_at":"2026-05-29T03:25:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Smaller models provide temporally correlated policy-level diversity that serves as structured exploration for training larger models in GRPO, yielding accuracy gains such as +8.8% on AIME 24 with reduced compute via the S2L-PO framework.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30770","ref_index":39,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SSR: Scaling Surefooted and Symmetric Humanoid Traversal to the Open World","primary_cat":"cs.RO","submitted_at":"2026-05-29T02:54:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SSR is an end-to-end vision-based framework for humanoid traversal that learns imagined foothold guidance, equivariant latent-space symmetry augmentation, and terrain-specific multi-discriminator motion priors to enable safe locomotion on diverse real-world terrains.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30749","ref_index":40,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"FLAG: Flow Policy MaxEnt-RL by Latent Augmented Guidance","primary_cat":"cs.LG","submitted_at":"2026-05-29T02:25:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"FLAG augments state space with flow latent variable to optimize a proxy MaxEnt-RL objective, enabling expressive policies with limited importance samples in high-dimensional control.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00143","ref_index":43,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Regime-Adaptive Continual Learning for Portfolio Management","primary_cat":"q-fin.PM","submitted_at":"2026-05-29T02:24:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"ReCAP segments markets into regimes, builds a policy library via continual learning, and uses a regime-gate to adapt trading policies, claiming superior returns and fast adaptation on five real datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.24892","ref_index":37,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ReviewGuard: Aligning LLM-Assisted Peer Review with Long-Term Scientific Impact","primary_cat":"cs.DL","submitted_at":"2026-05-29T02:05:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ReviewGuard aligns LLM peer reviews with future citations via impact-aligned RL, achieving Spearman ρ=0.776 on rejected-then-published AI/ML papers versus 0.492 for human reviewers and flagging 5.6× more high-impact cases.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30719","ref_index":17,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"When are LLMs Sufficient Policy Optimizers for Sequential RL Tasks?","primary_cat":"cs.LG","submitted_at":"2026-05-29T01:24:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"PromptPO shows LLMs can act as black-box policy optimizers for sequential RL when leveraging prior knowledge, matching baselines in exploration and robotics but underperforming in MuJoCo.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30712","ref_index":39,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ExpGraph: Model-Agnostic Experience Learning with Graph-Structured Memory for LLM Agents","primary_cat":"cs.CL","submitted_at":"2026-05-29T01:04:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ExpGraph builds a graph of summarized agent experiences and uses graph diffusion plus an RL-trained retrieval copilot to improve frozen LLM executors on QA, math, code, and agentic tasks without parameter updates.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00135","ref_index":30,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"On Effectiveness and Efficiency of Agentic Tool-calling and RL Training","primary_cat":"cs.LG","submitted_at":"2026-05-28T22:21:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Tool-calling evaluations for LLM agents are highly sensitive to implementation details such as random seeds and history handling, and two new techniques accelerate RL training with wall-clock speedup and no performance degradation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00133","ref_index":43,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"World Models: A Comprehensive Survey of Architectures, Methodologies, Reasoning Paradigms, and Applications","primary_cat":"cs.LG","submitted_at":"2026-05-28T21:23:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"The paper delivers a multi-axis taxonomy for world models that maps architectures, training families, reasoning strategies, and domains from early cognitive foundations through systems such as Dreamer, MuZero, and Sora while noting evaluation gaps.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30524","ref_index":33,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Representation Collapse in Sequential Post-Training of Large Language Models","primary_cat":"cs.LG","submitted_at":"2026-05-28T19:59:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Sequential post-training of LLMs induces representation collapse that correlates with reduced plasticity, weaker generalization, and poorer calibration, with lightweight interventions tested to mitigate it.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30451","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"VeriGate: Verifier-Gated Step-Level Supervision for GRPO","primary_cat":"cs.LG","submitted_at":"2026-05-28T18:20:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"VeriGate adds verifier-gated step-level supervision to GRPO via cumulated PRM rewards and group-normalized token advantages, raising accuracy 20% and 12% on 1.5B and 7B models on MATH and six benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30313","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"UniLab: A Heterogeneous Architecture for Robot RL Beyond GPU-Dominant Paradigms","primary_cat":"cs.RO","submitted_at":"2026-05-28T17:53:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"UniLab is a CPU/GPU heterogeneous system for robot RL training using MuJoCoUni and MotrixSim backends that reports 3-10x end-to-end efficiency improvements and cross-platform compatibility beyond CUDA.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30220","ref_index":41,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"TriSearch: Learning to Optimize Triangulations via Bistellar Flips","primary_cat":"cs.LG","submitted_at":"2026-05-28T16:54:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"TriSearch is an RL framework that optimizes triangulations of polytopes using bistellar flips with a circuit-supported subtriangulation action representation, generalizing zero-shot to larger instances and outperforming prior samplers in 3D and 4D.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30201","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"HPO: Hysteretic Policy Optimization for Stable and Efficient Training under Sparse-Reward Regime","primary_cat":"cs.LG","submitted_at":"2026-05-28T16:38:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"HPO and A-HPO modify GRPO by reducing negative advantage weights and using mean length normalization, yielding higher rewards in early sparse-reward stages on TeleLogs (0.84 final) and Countdown benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30160","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"On Distributional Reinforcement Learning in Chaotic Dynamical Systems","primary_cat":"cs.LG","submitted_at":"2026-05-28T16:17:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Distributional RL yields smoother objectives in chaotic systems because return distributions evolve more regularly than individual trajectories under the 1-Wasserstein metric.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30154","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"RL2ML: Finite-Rollout Surrogate Objectives from Reinforcement Learning to Maximum Likelihood","primary_cat":"cs.LG","submitted_at":"2026-05-28T16:14:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"RL2ML introduces a parameterized family of surrogate objectives bridging RL and ML with unbiased gradient estimators, group-level update-scale analysis, and metric-dependent optimization for finite-rollout LLM training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30056","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Sample-Efficient Diffusion-based Reinforcement Learning with Critic Guidance","primary_cat":"cs.RO","submitted_at":"2026-05-28T15:07:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CGPO integrates training-free critic guidance into diffusion denoising to produce high-Q actions as regression targets, yielding SOTA results on MuJoCo locomotion and successful Franka arm grasping.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29860","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ESPO: Early-Stopping Proximal Policy Optimization","primary_cat":"cs.LG","submitted_at":"2026-05-28T12:40:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"ESPO adds on-the-fly early stopping to PPO rollouts for LLM math reasoning using cumulative surrogate regret, improving AIME, AMC, and MATH-500 scores over PPO while cutting over 20% rollout tokens on a 7B model.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29823","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Quantifying and Optimizing Simplicity via Polynomial Representations","primary_cat":"cs.AI","submitted_at":"2026-05-28T12:05:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Polynomial representations yield an effective-degree simplicity metric that predicts generalization across tasks and serves as a differentiable regularizer improving performance in classification and RL.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29782","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Hista and Numca: Estimate State Value Effectively for LLM Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-05-28T11:31:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Introduces SVEB benchmark and Numca/Hista methods claiming more accurate state value estimates and better RL training performance for LLMs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.05216","ref_index":194,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A Comprehensive Survey on Semantic Communication in Non-Terrestrial Networks: Architectures, Methodologies, and Challenges","primary_cat":"cs.IT","submitted_at":"2026-05-28T09:02:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":2.0,"formal_verification":"none","one_line_summary":"A literature survey that pairs NTN limitations with semantic communication properties, organizes work by platform and methodology, and lists open problems for integrated SAGIN systems.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29625","ref_index":28,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Improving Collaborative Storytelling with a Multi-Agent Framework Based on Large Language Models","primary_cat":"cs.AI","submitted_at":"2026-05-28T08:59:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"An iterative writer-editor multi-agent LLM process improves perceived story quality in simulations of child collaborative storytelling.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29582","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PEARL: Training Socratic Tutors with Pedagogically Aligned Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-05-28T08:25:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PEARL is a pedagogically aligned RL framework using a controllable student simulator, generative reward model, and stable multi-objective scheme to train Socratic tutors that outperform other open-source models on benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29425","ref_index":37,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ReasonLight: A Multimodal Foundation Model-Enhanced Reinforcement Learning Framework for Zero-Shot Traffic Signal Control","primary_cat":"cs.AI","submitted_at":"2026-05-28T06:19:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ReasonLight uses multimodal foundation models to refine RL-proposed traffic signal phases based on camera images and sensor data, enabling zero-shot adaptation to unseen events such as emergency vehicle priority.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29421","ref_index":10,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Learning Design Skills as Memory Policies for Agentic Photonic Inverse Design","primary_cat":"cs.CL","submitted_at":"2026-05-28T06:14:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SkillPCF is a closed-loop agent framework with a physics-guided memory skill bank, reinforcement-learned skill selection, and simulator-grounded evolution that improves design quality and efficiency for photonic crystal fiber inverse design under limited simulation budgets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29254","ref_index":61,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Extreme dynamic symmetry enables omnidirectional and multifunctional robots","primary_cat":"cs.RO","submitted_at":"2026-05-28T02:15:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"Dynamic isotropy, quantifying uniform center-of-mass acceleration capability, improves robot performance and enables omnidirectional locomotion, terrain traversal, and failure resilience in a spherical robot design.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29198","ref_index":29,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Guidance Contrastive Token Credit Assignment for Discrete Policy Optimization","primary_cat":"cs.CV","submitted_at":"2026-05-28T00:17:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GCPO performs per-token credit assignment in discrete policy optimization by setting token advantages proportional to the difference in model predictions under positive versus negative prompts, outperforming GRPO and DAPO on text-to-image and chain-of-thought tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.27895","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Application of Reinforcement Learning for Multigroup Energy Grid Optimization for Neutron Transport Criticality Problems","primary_cat":"physics.comp-ph","submitted_at":"2026-05-27T03:19:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Reinforcement learning with surrogate models optimizes multigroup energy structures for 1D spherical k-criticality problems and outperforms standard structures on Godiva and BeRP test cases.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23903","ref_index":66,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Geo-Align: Video Generation Alignment via Metric Geometry Reward","primary_cat":"cs.CV","submitted_at":"2026-05-22T17:59:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Geo-Align applies RL with a perceptual reward derived from 3D camera trajectory estimation to improve controllability and fidelity in video generation without paired training data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23863","ref_index":30,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Robotic Strawberry Harvesting with Robust Vision and Deep Reinforcement Learning based Sim-to-Real Control","primary_cat":"cs.RO","submitted_at":"2026-05-22T17:21:06+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":4.0,"formal_verification":"none","one_line_summary":"A modified YOLO segmentation model plus sim-trained PPO control yields 84.3% overall success harvesting 281 strawberries in greenhouse trials on a real UR10e manipulator.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23762","ref_index":28,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Direct Dynamic Retargeting for Humanoid Imitation Learning from Videos","primary_cat":"cs.RO","submitted_at":"2026-05-22T15:33:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DDR is a single-stage task-space framework using sampling-based MPC in a physics simulator to produce high-fidelity dynamically feasible references from video demos, claimed to outperform geometric and indirect retargeting baselines in tracking accuracy and to speed up RL training for agile humanoid","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23717","ref_index":25,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Vision-Based Agile Landing on Turbulent Waters","primary_cat":"cs.RO","submitted_at":"2026-05-22T14:59:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Reinforcement learning policy trained on synthetic visual features in simulation enables zero-shot real-world agile multirotor landing on turbulent maritime platforms without explicit platform-state estimation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23652","ref_index":25,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"One Policy, Infinite NPCs: Persona-Traceable Shared RL Policies for Scalable Game Agents","primary_cat":"cs.AI","submitted_at":"2026-05-22T14:04:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"pcsp is a shared RL policy using LLM persona embeddings, low-rank projection, and PPO+InfoNCE+KL training that delivers 17x above-chance zero-shot persona identification and 22x faster inference on a 300-persona benchmark.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23565","ref_index":59,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Understanding Goal Generalisation in Sequential Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-05-22T12:31:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Empirical analysis of over 100 sequential RL training pipelines across 250+ OOD environments finds salient features drive generalization and early goals persist, with latent policy gradients simulating latent variable evolution to predict OOD behavior from training history.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23560","ref_index":38,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SafeSABR: Risk-Calibrated Adaptive Bitrate Streaming over Starlink Networks","primary_cat":"eess.SY","submitted_at":"2026-05-22T12:27:47+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23551","ref_index":63,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Goal-Conditioned Agents that Learn Everything All at Once","primary_cat":"cs.LG","submitted_at":"2026-05-22T12:17:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LEO enables efficient all-goals learning in goal-conditioned RL by jointly predicting for all goals in one network pass, yielding >250x speedup over relabelling and better performance on Craftax.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23493","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"EDGE-OPD: Internalizing Privileged Context with Evidence Guided On-Policy Distillation","primary_cat":"cs.AI","submitted_at":"2026-05-22T10:55:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"EDGE-OPD adds guided rollouts and evidence masking to on-policy self-distillation, enabling successful learning of target identities where standard OPSD and RLSD fail.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23463","ref_index":40,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"StepAudio 2.5 Technical Report","primary_cat":"eess.AS","submitted_at":"2026-05-22T10:24:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"StepAudio 2.5 is a unified audio-language foundation model that reaches state-of-the-art results on ASR, TTS, and realtime interaction by using task-tailored RLHF on a shared backbone.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23435","ref_index":44,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MileStone: A Multi-Objective Compiler Phase Ordering Framework for Graph-based IR-Level Optimization","primary_cat":"cs.PL","submitted_at":"2026-05-22T09:45:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MileStone models compiler phase ordering as a multi-objective optimization problem using graph representations, GNN predictions, and RL agents to find Pareto-optimal pass sequences under user constraints.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23415","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Reflex: Reinforcement Learning with Reflection Symmetry Exploitation in State-Based Continuous Control","primary_cat":"cs.LG","submitted_at":"2026-05-22T09:24:44+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23398","ref_index":26,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"TPMM-DPO: Trajectory-aware Preference-guided Model Merging for Iterative Direct Preference Optimization","primary_cat":"cs.IR","submitted_at":"2026-05-22T09:11:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"TPMM-DPO applies trajectory-aware learned-weight merging of prior policy models to stabilize iterative DPO against preference noise accumulation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23365","ref_index":19,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Score-Based One-step MeanFlow Policy Optimization","primary_cat":"cs.LG","submitted_at":"2026-05-22T08:28:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SOM is an actor-critic algorithm that constructs the target velocity field for one-step MeanFlow policies directly from the Q-function via score estimation and probability flow ODE, achieving claimed SOTA on locomotion tasks with reduced training and inference time.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23285","ref_index":40,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Reinforcement Learning for Microcanonical Graph Ensemble with Assortativity Constraints","primary_cat":"cs.LG","submitted_at":"2026-05-22T06:57:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DMGG uses reinforcement learning to generate microcanonical graph ensembles with exact assortativity constraints via degree-preserving rewirings, claiming faster generation and better diversity than ERGM approaches.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23261","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"UniSRM: A Unified Speech Reward Model for Reasoning-Based Fine-grained Assessment","primary_cat":"eess.AS","submitted_at":"2026-05-22T06:02:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"UniSRM is a unified speech reward model with new datasets that uses a two-stage reasoning pipeline to deliver interpretable, human-aligned evaluations across utterance quality to context coherence tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23067","ref_index":21,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"What Training Data Teaches RL Memory Agents: An Empirical Study of Curriculum Effects in Memory-Augmented QA","primary_cat":"cs.CL","submitted_at":"2026-05-21T21:58:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Controlled study shows mixed training curricula improve aggregate F1 on memory QA benchmarks while out-of-domain data transfers targeted skills like temporal reasoning, with per-question-type effects exceeding aggregate differences.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22814","ref_index":20,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Remember to be Curious: Episodic Context and Persistent Worlds for 3D Exploration","primary_cat":"cs.LG","submitted_at":"2026-05-21T17:58:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A curiosity-based 3D exploration policy that pairs persistent online 3D reconstruction with episodic sequence modeling over RGB to outperform active-mapping baselines on HM3D and transfer zero-shot to Gibson and synthetic worlds.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22773","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Deep Reinforcement Learning for Flexible Job Shop Scheduling with Random Job Arrivals","primary_cat":"cs.AI","submitted_at":"2026-05-21T17:33:09+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"A PPO-trained DRL agent selects from established dispatching rules to minimize total job completion time in FJSP with random arrivals, outperforming single rules and performing competitively with arrival-triggered MILP on heterogeneous datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22748","ref_index":38,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Superhuman Safe and Agile Racing through Multi-Agent Reinforcement Learning","primary_cat":"cs.RO","submitted_at":"2026-05-21T17:15:54+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22731","ref_index":24,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Post-Training is About States, Not Tokens: A State Distribution View of SFT, RL, and On-Policy Distillation","primary_cat":"cs.LG","submitted_at":"2026-05-21T17:03:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A state distribution view of post-training shows that on-policy supervision from the learner itself can outperform fixed-dataset SFT and preserve retention better than aggressive supervised updates.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22703","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Clipping Bottleneck: Stabilizing RLVR via Stochastic Recovery of Near-Boundary Signals","primary_cat":"cs.LG","submitted_at":"2026-05-21T16:45:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Proposes Near-boundary Stochastic Rescue (NSR) as a stochastic modification to clipping in RLVR that recovers near-boundary signals and yields gains over baselines like DAPO and GSPO.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":100,"offset":0}}