{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2024:4CBGIT3DTRJ4OSJTIKD5RLVKYP","short_pith_number":"pith:4CBGIT3D","schema_version":"1.0","canonical_sha256":"e082644f639c53c749334287d8aeaac3fe23604b9d09cf29130161f2f0868101","source":{"kind":"arxiv","id":"2406.18629","version":1},"attestation_state":"computed","paper":{"title":"Step-DPO: Step-wise Preference Optimization for Long-chain Reasoning of LLMs","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","headline":"","cross_cats":["cs.AI","cs.CL"],"primary_cat":"cs.LG","authors_text":"Jiaya Jia, Senqiao Yang, Xiangru Peng, Xin Lai, Yukang Chen, Zhuotao Tian","submitted_at":"2024-06-26T17:43:06Z","abstract_excerpt":"Mathematical reasoning presents a significant challenge for Large Language Models (LLMs) due to the extensive and precise chain of reasoning required for accuracy. Ensuring the correctness of each reasoning step is critical. To address this, we aim to enhance the robustness and factuality of LLMs by learning from human feedback. However, Direct Preference Optimization (DPO) has shown limited benefits for long-chain mathematical reasoning, as models employing DPO struggle to identify detailed errors in incorrect answers. This limitation stems from a lack of fine-grained process supervision. We "},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2406.18629","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.LG","submitted_at":"2024-06-26T17:43:06Z","cross_cats_sorted":["cs.AI","cs.CL"],"title_canon_sha256":"3667cac0c5711d844d8a067790368dd9a16a04a1bb68844fa92baebc08c4f174","abstract_canon_sha256":"e34a91beb2e2e155022b324febc212b54dc86d771a45391f2d4bdb223aba9e8f"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T23:52:39.417639Z","signature_b64":"qkRzdaQdyxah0ln/iAcGZgJlaKdvBM9NhR5P09814uDbAHQW/PGthaemHD/g6arILYk7XAK9u26P21H4o2T3CA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"e082644f639c53c749334287d8aeaac3fe23604b9d09cf29130161f2f0868101","last_reissued_at":"2026-05-18T23:52:39.414688Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T23:52:39.414688Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Step-DPO: Step-wise Preference Optimization for Long-chain Reasoning of LLMs","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","headline":"","cross_cats":["cs.AI","cs.CL"],"primary_cat":"cs.LG","authors_text":"Jiaya Jia, Senqiao Yang, Xiangru Peng, Xin Lai, Yukang Chen, Zhuotao Tian","submitted_at":"2024-06-26T17:43:06Z","abstract_excerpt":"Mathematical reasoning presents a significant challenge for Large Language Models (LLMs) due to the extensive and precise chain of reasoning required for accuracy. Ensuring the correctness of each reasoning step is critical. To address this, we aim to enhance the robustness and factuality of LLMs by learning from human feedback. However, Direct Preference Optimization (DPO) has shown limited benefits for long-chain mathematical reasoning, as models employing DPO struggle to identify detailed errors in incorrect answers. This limitation stems from a lack of fine-grained process supervision. We "},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2406.18629","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2406.18629","created_at":"2026-05-18T23:52:39.414823+00:00"},{"alias_kind":"arxiv_version","alias_value":"2406.18629v1","created_at":"2026-05-18T23:52:39.414823+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2406.18629","created_at":"2026-05-18T23:52:39.414823+00:00"},{"alias_kind":"pith_short_12","alias_value":"4CBGIT3DTRJ4","created_at":"2026-05-18T23:52:39.414823+00:00"},{"alias_kind":"pith_short_16","alias_value":"4CBGIT3DTRJ4OSJT","created_at":"2026-05-18T23:52:39.414823+00:00"},{"alias_kind":"pith_short_8","alias_value":"4CBGIT3D","created_at":"2026-05-18T23:52:39.414823+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":20,"internal_anchor_count":20,"sample":[{"citing_arxiv_id":"2507.05179","citing_title":"From Fragments to Facts: A Curriculum-Driven DPO Approach for Generating Hindi News Veracity Explanations","ref_index":23,"is_internal_anchor":true},{"citing_arxiv_id":"2508.06412","citing_title":"Sample-efficient LLM Optimization with Reset Replay","ref_index":11,"is_internal_anchor":true},{"citing_arxiv_id":"2509.02547","citing_title":"The Landscape of Agentic Reinforcement Learning for LLMs: A Survey","ref_index":46,"is_internal_anchor":true},{"citing_arxiv_id":"2509.19893","citing_title":"Future Policy Approximation for Offline Reinforcement Learning Improves Mathematical Reasoning","ref_index":2,"is_internal_anchor":true},{"citing_arxiv_id":"2510.07972","citing_title":"SHE: Stepwise Hybrid Examination Reinforcement Learning Framework for E-commerce Search Relevance","ref_index":10,"is_internal_anchor":true},{"citing_arxiv_id":"2510.24235","citing_title":"PaTaRM: Bridging Pairwise and Pointwise Signals via Preference-Aware Task-Adaptive Reward Modeling","ref_index":9,"is_internal_anchor":true},{"citing_arxiv_id":"2512.19728","citing_title":"Hard Negative Sample-Augmented DPO Post-Training for Small Language Models","ref_index":23,"is_internal_anchor":true},{"citing_arxiv_id":"2411.10442","citing_title":"Enhancing the Reasoning Ability of Multimodal Large Language Models via Mixed Preference Optimization","ref_index":43,"is_internal_anchor":true},{"citing_arxiv_id":"2508.07407","citing_title":"A Comprehensive Survey of Self-Evolving AI Agents: A New Paradigm Bridging Foundation Models and Lifelong Agentic Systems","ref_index":47,"is_internal_anchor":true},{"citing_arxiv_id":"2504.11536","citing_title":"ReTool: Reinforcement Learning for Strategic Tool Use in LLMs","ref_index":7,"is_internal_anchor":true},{"citing_arxiv_id":"2605.11906","citing_title":"YFPO: A Preliminary Study of Yoked Feature Preference Optimization with Neuron-Guided Rewards for Mathematical Reasoning","ref_index":19,"is_internal_anchor":true},{"citing_arxiv_id":"2502.17419","citing_title":"From System 1 to System 2: A Survey of Reasoning Large Language Models","ref_index":181,"is_internal_anchor":true},{"citing_arxiv_id":"2502.05171","citing_title":"Scaling up Test-Time Compute with Latent Reasoning: A Recurrent Depth Approach","ref_index":86,"is_internal_anchor":true},{"citing_arxiv_id":"2605.08472","citing_title":"Mid-Training with Self-Generated Data Improves Reinforcement Learning in Language Models","ref_index":25,"is_internal_anchor":true},{"citing_arxiv_id":"2605.08094","citing_title":"MedThink: Enhancing Diagnostic Accuracy in Small Models via Teacher-Guided Reasoning Correction","ref_index":10,"is_internal_anchor":true},{"citing_arxiv_id":"2605.10805","citing_title":"Reasoning Is Not Free: Robust Adaptive Cost-Efficient Routing for LLM-as-a-Judge","ref_index":12,"is_internal_anchor":true},{"citing_arxiv_id":"2605.02290","citing_title":"Distilling Long-CoT Reasoning through Collaborative Step-wise Multi-Teacher Decoding","ref_index":47,"is_internal_anchor":true},{"citing_arxiv_id":"2604.08723","citing_title":"Decomposing the Delta: What Do Models Actually Learn from Preference Pairs?","ref_index":9,"is_internal_anchor":true},{"citing_arxiv_id":"2604.05341","citing_title":"Curr-RLCER:Curriculum Reinforcement Learning For Coherence Explainable Recommendation","ref_index":7,"is_internal_anchor":true},{"citing_arxiv_id":"2605.02141","citing_title":"On the Optimal Sample Complexity of Offline Multi-Armed Bandits with KL Regularization","ref_index":29,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/4CBGIT3DTRJ4OSJTIKD5RLVKYP","json":"https://pith.science/pith/4CBGIT3DTRJ4OSJTIKD5RLVKYP.json","graph_json":"https://pith.science/api/pith-number/4CBGIT3DTRJ4OSJTIKD5RLVKYP/graph.json","events_json":"https://pith.science/api/pith-number/4CBGIT3DTRJ4OSJTIKD5RLVKYP/events.json","paper":"https://pith.science/paper/4CBGIT3D"},"agent_actions":{"view_html":"https://pith.science/pith/4CBGIT3DTRJ4OSJTIKD5RLVKYP","download_json":"https://pith.science/pith/4CBGIT3DTRJ4OSJTIKD5RLVKYP.json","view_paper":"https://pith.science/paper/4CBGIT3D","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2406.18629&json=true","fetch_graph":"https://pith.science/api/pith-number/4CBGIT3DTRJ4OSJTIKD5RLVKYP/graph.json","fetch_events":"https://pith.science/api/pith-number/4CBGIT3DTRJ4OSJTIKD5RLVKYP/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/4CBGIT3DTRJ4OSJTIKD5RLVKYP/action/timestamp_anchor","attest_storage":"https://pith.science/pith/4CBGIT3DTRJ4OSJTIKD5RLVKYP/action/storage_attestation","attest_author":"https://pith.science/pith/4CBGIT3DTRJ4OSJTIKD5RLVKYP/action/author_attestation","sign_citation":"https://pith.science/pith/4CBGIT3DTRJ4OSJTIKD5RLVKYP/action/citation_signature","submit_replication":"https://pith.science/pith/4CBGIT3DTRJ4OSJTIKD5RLVKYP/action/replication_record"}},"created_at":"2026-05-18T23:52:39.414823+00:00","updated_at":"2026-05-18T23:52:39.414823+00:00"}