{"work":{"id":"c25e8154-fab2-455c-8a26-56e40aed5d2b","openalex_id":null,"doi":null,"arxiv_id":null,"raw_key":"raw:0872224998f434b947c81c3c","title":"Advances in neural information processing systems , volume=","authors":null,"authors_text":"Training language models to follow instructions with human feedback , author=","year":null,"venue":null,"abstract":null,"external_url":null,"cited_by_count":null,"metadata_source":"raw_reference","metadata_fetched_at":"2026-05-27T11:04:00.525486+00:00","pith_arxiv_id":null,"created_at":"2026-05-11T05:36:27.281333+00:00","updated_at":"2026-05-27T11:04:00.525486+00:00","title_quality_ok":false,"display_title":"Advances in neural information processing systems , volume=","render_title":"Advances in neural information processing systems , volume="},"hub":{"state":{"work_id":"c25e8154-fab2-455c-8a26-56e40aed5d2b","tier":"hub","tier_reason":"10+ Pith inbound or 1,000+ external citations","pith_inbound_count":97,"external_cited_by_count":null,"distinct_field_count":10,"first_pith_cited_at":"2024-04-29T04:11:28+00:00","last_pith_cited_at":"2026-05-22T05:25:00+00:00","author_build_status":"not_needed","summary_status":"needed","contexts_status":"needed","graph_status":"needed","ask_index_status":"not_needed","reader_status":"not_needed","recognition_status":"not_needed","updated_at":"2026-05-29T22:50:29.200841+00:00","tier_text":"hub"},"tier":"hub","role_counts":[{"context_role":"background","n":5},{"context_role":"method","n":2}],"polarity_counts":[{"context_polarity":"background","n":3},{"context_polarity":"unclear","n":2},{"context_polarity":"use_method","n":2}],"runs":{"context_extract":{"job_type":"context_extract","status":"succeeded","result":{"enqueued_papers":25},"error":null,"updated_at":"2026-05-15T05:17:57.583286+00:00"},"graph_features":{"job_type":"graph_features","status":"succeeded","result":{"co_cited":[{"title":"DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models","work_id":"c5006563-f3ec-438a-9e35-b7b484f34828","shared_citers":16},{"title":"Proximal Policy Optimization Algorithms","work_id":"240c67fe-d14d-4520-91c1-38a4e272ca19","shared_citers":14},{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","shared_citers":13},{"title":"DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning","work_id":"e6b75ad5-2877-4168-97c8-710407094d20","shared_citers":12},{"title":"Advances in neural information processing systems , volume=","work_id":"b0092220-e738-45d3-97b1-bd1f0bd834c5","shared_citers":9},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":9},{"title":"Evaluating Large Language Models Trained on Code","work_id":"042493e9-b26f-4b4e-bbde-382072ca9b08","shared_citers":8},{"title":"Advances in Neural Information Processing Systems , volume=","work_id":"be2b69de-45c4-4db5-ab23-0bff300c6059","shared_citers":7},{"title":"DAPO: An Open-Source LLM Reinforcement Learning System at Scale","work_id":"64019d00-0b11-4bbd-b173-b46c8fad0157","shared_citers":7},{"title":"OpenAI o1 System Card","work_id":"68d3c334-0fc9-49e3-b7b0-a69afae933e2","shared_citers":7},{"title":"Training Verifiers to Solve Math Word Problems","work_id":"acab1aa8-b4d6-40e0-a3ee-25341701dca2","shared_citers":7},{"title":"Measuring Mathematical Problem Solving With the MATH Dataset","work_id":"50652ac6-fb7c-4675-a2c2-159c241feb17","shared_citers":6},{"title":"Qwen Technical Report","work_id":"bb1fd52f-6b2f-437c-9516-37bdf6eb9be8","shared_citers":6},{"title":"Training a Helpful and Harmless Assistant with Reinforcement Learning from Human Feedback","work_id":"a1f2574b-a899-4713-be60-c87ba332656c","shared_citers":6},{"title":"Advances in neural information processing systems , volume=","work_id":"4a77e424-cf16-4c16-8d94-2ee44db893d1","shared_citers":5},{"title":"Machine learning , volume=","work_id":"c1d54d34-8857-4268-be1f-002a7c436d05","shared_citers":5},{"title":"Tulu 3: Pushing Frontiers in Open Language Model Post-Training","work_id":"28c9dbea-056a-48c2-8000-85f809827e45","shared_citers":5},{"title":"Advances in neural information processing systems , volume=","work_id":"1265447d-0324-4d07-abba-34fa29d172da","shared_citers":4},{"title":"Advances in neural information processing systems , volume=","work_id":"12f5a236-ef7a-4d13-b4de-b51465a6f977","shared_citers":4},{"title":"Challenging BIG-Bench Tasks and Whether Chain-of-Thought Can Solve Them","work_id":"513eb205-04ca-4722-9a43-a74e8cbe7e85","shared_citers":4},{"title":"Constitutional AI: Harmlessness from AI Feedback","work_id":"faaaa4e0-2676-4fac-a0b4-99aef10d2095","shared_citers":4},{"title":"Group Sequence Policy Optimization","work_id":"3a98b53b-9f52-4d95-adf7-89353c0a9a65","shared_citers":4},{"title":"Instruction-Following Evaluation for Large Language Models","work_id":"3aa06177-125a-4f5a-8f4a-8070c5986c26","shared_citers":4},{"title":"LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code","work_id":"ea9e51ce-1e75-4182-92d8-4d25f70d2ee4","shared_citers":4}],"time_series":[{"n":2,"year":2024},{"n":4,"year":2025},{"n":31,"year":2026}],"dependency_candidates":[]},"error":null,"updated_at":"2026-05-15T05:17:44.230531+00:00"},"identity_refresh":{"job_type":"identity_refresh","status":"succeeded","result":{"items":[{"title":"Qwen3 Technical Report","outcome":"unchanged","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","resolver":"local_arxiv","confidence":0.98,"old_work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e"}],"counts":{"fixed":0,"merged":0,"unchanged":1,"quarantined":0,"needs_external_resolution":0},"errors":[],"attempted":1},"error":null,"updated_at":"2026-05-15T05:17:55.758137+00:00"},"summary_claims":{"job_type":"summary_claims","status":"succeeded","result":{"title":"Advances in neural information processing systems , volume=","claims":[{"claim_text":"Appendix B), which normalizes rewards within the G rollouts of each task into group-relative advantages. Utilization and query.The action tokens a1:T are conditioned on (xi, zi) and optimized by the task outcome Rutil i =r(τ i). The query qi precedes the actions in the same sequence and receives gradients through the same objective: J util(θ) =J GRPO θ;{τ 1, . . . , τG},{ ˆA1, . . . , ˆAG} \u0001 .(8) Re-ranking.The permutation σi is generated conditioned on the task xi and retrieved candidates Bi K,","claim_type":"background","confidence":0.6,"evidence_strength":"citation_context"}],"why_cited":"Pith tracks Advances in neural information processing systems , volume= because it crossed a citation-hub threshold. Current citing contexts most often use it as background evidence (1 contexts).","role_counts":[{"n":1,"context_role":"background"}]},"error":null,"updated_at":"2026-05-15T05:17:57.587883+00:00"}},"summary":{"title":"Advances in neural information processing systems , volume=","claims":[{"claim_text":"Appendix B), which normalizes rewards within the G rollouts of each task into group-relative advantages. Utilization and query.The action tokens a1:T are conditioned on (xi, zi) and optimized by the task outcome Rutil i =r(τ i). The query qi precedes the actions in the same sequence and receives gradients through the same objective: J util(θ) =J GRPO θ;{τ 1, . . . , τG},{ ˆA1, . . . , ˆAG} \u0001 .(8) Re-ranking.The permutation σi is generated conditioned on the task xi and retrieved candidates Bi K,","claim_type":"background","confidence":0.6,"evidence_strength":"citation_context"}],"why_cited":"Pith tracks Advances in neural information processing systems , volume= because it crossed a citation-hub threshold. Current citing contexts most often use it as background evidence (1 contexts).","role_counts":[{"n":1,"context_role":"background"}]},"graph":{"co_cited":[{"title":"DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models","work_id":"c5006563-f3ec-438a-9e35-b7b484f34828","shared_citers":16},{"title":"Proximal Policy Optimization Algorithms","work_id":"240c67fe-d14d-4520-91c1-38a4e272ca19","shared_citers":14},{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","shared_citers":13},{"title":"DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning","work_id":"e6b75ad5-2877-4168-97c8-710407094d20","shared_citers":12},{"title":"Advances in neural information processing systems , volume=","work_id":"b0092220-e738-45d3-97b1-bd1f0bd834c5","shared_citers":9},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":9},{"title":"Evaluating Large Language Models Trained on Code","work_id":"042493e9-b26f-4b4e-bbde-382072ca9b08","shared_citers":8},{"title":"Advances in Neural Information Processing Systems , volume=","work_id":"be2b69de-45c4-4db5-ab23-0bff300c6059","shared_citers":7},{"title":"DAPO: An Open-Source LLM Reinforcement Learning System at Scale","work_id":"64019d00-0b11-4bbd-b173-b46c8fad0157","shared_citers":7},{"title":"OpenAI o1 System Card","work_id":"68d3c334-0fc9-49e3-b7b0-a69afae933e2","shared_citers":7},{"title":"Training Verifiers to Solve Math Word Problems","work_id":"acab1aa8-b4d6-40e0-a3ee-25341701dca2","shared_citers":7},{"title":"Measuring Mathematical Problem Solving With the MATH Dataset","work_id":"50652ac6-fb7c-4675-a2c2-159c241feb17","shared_citers":6},{"title":"Qwen Technical Report","work_id":"bb1fd52f-6b2f-437c-9516-37bdf6eb9be8","shared_citers":6},{"title":"Training a Helpful and Harmless Assistant with Reinforcement Learning from Human Feedback","work_id":"a1f2574b-a899-4713-be60-c87ba332656c","shared_citers":6},{"title":"Advances in neural information processing systems , volume=","work_id":"4a77e424-cf16-4c16-8d94-2ee44db893d1","shared_citers":5},{"title":"Machine learning , volume=","work_id":"c1d54d34-8857-4268-be1f-002a7c436d05","shared_citers":5},{"title":"Tulu 3: Pushing Frontiers in Open Language Model Post-Training","work_id":"28c9dbea-056a-48c2-8000-85f809827e45","shared_citers":5},{"title":"Advances in neural information processing systems , volume=","work_id":"1265447d-0324-4d07-abba-34fa29d172da","shared_citers":4},{"title":"Advances in neural information processing systems , volume=","work_id":"12f5a236-ef7a-4d13-b4de-b51465a6f977","shared_citers":4},{"title":"Challenging BIG-Bench Tasks and Whether Chain-of-Thought Can Solve Them","work_id":"513eb205-04ca-4722-9a43-a74e8cbe7e85","shared_citers":4},{"title":"Constitutional AI: Harmlessness from AI Feedback","work_id":"faaaa4e0-2676-4fac-a0b4-99aef10d2095","shared_citers":4},{"title":"Group Sequence Policy Optimization","work_id":"3a98b53b-9f52-4d95-adf7-89353c0a9a65","shared_citers":4},{"title":"Instruction-Following Evaluation for Large Language Models","work_id":"3aa06177-125a-4f5a-8f4a-8070c5986c26","shared_citers":4},{"title":"LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code","work_id":"ea9e51ce-1e75-4182-92d8-4d25f70d2ee4","shared_citers":4}],"time_series":[{"n":2,"year":2024},{"n":4,"year":2025},{"n":31,"year":2026}],"dependency_candidates":[]},"authors":[]}}