{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:CRXVDAUCS5WRTEBA6LK6DKUUHH","short_pith_number":"pith:CRXVDAUC","schema_version":"1.0","canonical_sha256":"146f518282976d199020f2d5e1aa9439ed848d31a88ef701a12920ac0875e5c5","source":{"kind":"arxiv","id":"2602.01058","version":2},"attestation_state":"computed","paper":{"title":"Good SFT Optimizes for SFT, Better SFT Prepares for Reinforcement Learning","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI","cs.CL"],"primary_cat":"cs.LG","authors_text":"Dylan Zhang, Haojin Wang, Hao Peng, Qingzhi Chen, Yufeng Xu","submitted_at":"2026-02-01T06:53:45Z","abstract_excerpt":"Post-training of reasoning LLMs is a holistic process that typically consists of an offline SFT stage followed by an online reinforcement learning (RL) stage. However, SFT is often optimized in isolation to maximize SFT performance alone.\n  We show that, after identical RL training, models initialized from stronger SFT checkpoints can significantly underperform those initialized from weaker ones. We attribute this to a mismatch typical in current SFT-RL pipelines: the distribution that generates the offline SFT data can differ substantially from the policy optimized during online RL, which lea"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2602.01058","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-02-01T06:53:45Z","cross_cats_sorted":["cs.AI","cs.CL"],"title_canon_sha256":"95b1798664bb53f843d3b7325588847518f5e782a49c5da424cb015d7863f8d8","abstract_canon_sha256":"69ec8a600551900182b9112d7e90428cbbe87afe453d25e6b493a7349e229de6"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-29T01:05:03.090229Z","signature_b64":"2rp40GmCKmKcfvJZqMuiKxBl+gRdMH+tIliv5vpHwXan66rVvIBygYuTz+5VOojTvh0R2//ncgoZr69OSwieAg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"146f518282976d199020f2d5e1aa9439ed848d31a88ef701a12920ac0875e5c5","last_reissued_at":"2026-05-29T01:05:03.089153Z","signature_status":"signed_v1","first_computed_at":"2026-05-29T01:05:03.089153Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Good SFT Optimizes for SFT, Better SFT Prepares for Reinforcement Learning","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI","cs.CL"],"primary_cat":"cs.LG","authors_text":"Dylan Zhang, Haojin Wang, Hao Peng, Qingzhi Chen, Yufeng Xu","submitted_at":"2026-02-01T06:53:45Z","abstract_excerpt":"Post-training of reasoning LLMs is a holistic process that typically consists of an offline SFT stage followed by an online reinforcement learning (RL) stage. However, SFT is often optimized in isolation to maximize SFT performance alone.\n  We show that, after identical RL training, models initialized from stronger SFT checkpoints can significantly underperform those initialized from weaker ones. We attribute this to a mismatch typical in current SFT-RL pipelines: the distribution that generates the offline SFT data can differ substantially from the policy optimized during online RL, which lea"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2602.01058","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2602.01058/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2602.01058","created_at":"2026-05-29T01:05:03.089305+00:00"},{"alias_kind":"arxiv_version","alias_value":"2602.01058v2","created_at":"2026-05-29T01:05:03.089305+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2602.01058","created_at":"2026-05-29T01:05:03.089305+00:00"},{"alias_kind":"pith_short_12","alias_value":"CRXVDAUCS5WR","created_at":"2026-05-29T01:05:03.089305+00:00"},{"alias_kind":"pith_short_16","alias_value":"CRXVDAUCS5WRTEBA","created_at":"2026-05-29T01:05:03.089305+00:00"},{"alias_kind":"pith_short_8","alias_value":"CRXVDAUC","created_at":"2026-05-29T01:05:03.089305+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":1,"internal_anchor_count":1,"sample":[{"citing_arxiv_id":"2602.13934","citing_title":"Why Code, Why Now: An Information-Theoretic Perspective on the Limits of Machine Learning","ref_index":48,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/CRXVDAUCS5WRTEBA6LK6DKUUHH","json":"https://pith.science/pith/CRXVDAUCS5WRTEBA6LK6DKUUHH.json","graph_json":"https://pith.science/api/pith-number/CRXVDAUCS5WRTEBA6LK6DKUUHH/graph.json","events_json":"https://pith.science/api/pith-number/CRXVDAUCS5WRTEBA6LK6DKUUHH/events.json","paper":"https://pith.science/paper/CRXVDAUC"},"agent_actions":{"view_html":"https://pith.science/pith/CRXVDAUCS5WRTEBA6LK6DKUUHH","download_json":"https://pith.science/pith/CRXVDAUCS5WRTEBA6LK6DKUUHH.json","view_paper":"https://pith.science/paper/CRXVDAUC","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2602.01058&json=true","fetch_graph":"https://pith.science/api/pith-number/CRXVDAUCS5WRTEBA6LK6DKUUHH/graph.json","fetch_events":"https://pith.science/api/pith-number/CRXVDAUCS5WRTEBA6LK6DKUUHH/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/CRXVDAUCS5WRTEBA6LK6DKUUHH/action/timestamp_anchor","attest_storage":"https://pith.science/pith/CRXVDAUCS5WRTEBA6LK6DKUUHH/action/storage_attestation","attest_author":"https://pith.science/pith/CRXVDAUCS5WRTEBA6LK6DKUUHH/action/author_attestation","sign_citation":"https://pith.science/pith/CRXVDAUCS5WRTEBA6LK6DKUUHH/action/citation_signature","submit_replication":"https://pith.science/pith/CRXVDAUCS5WRTEBA6LK6DKUUHH/action/replication_record"}},"created_at":"2026-05-29T01:05:03.089305+00:00","updated_at":"2026-05-29T01:05:03.089305+00:00"}