{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2024:NQJTNYMIKV2COGVMGH3T2L434C","short_pith_number":"pith:NQJTNYMI","canonical_record":{"source":{"id":"2407.16216","kind":"arxiv","version":4},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-07-23T06:45:52Z","cross_cats_sorted":[],"title_canon_sha256":"bfec645c6a4739bb121af3462e0aafbe5a72d5ae773b578fbab24860c6d8d48c","abstract_canon_sha256":"75f29f80c71bf34cb9ade4953861a2bc9667cce7a8189fa8eb6a8cc0e6c3edf2"},"schema_version":"1.0"},"canonical_sha256":"6c1336e1885574271aac31f73d2f9be0be7973e1d05b2fed82916d8eb20ec3db","source":{"kind":"arxiv","id":"2407.16216","version":4},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2407.16216","created_at":"2026-05-20T00:02:46Z"},{"alias_kind":"arxiv_version","alias_value":"2407.16216v4","created_at":"2026-05-20T00:02:46Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2407.16216","created_at":"2026-05-20T00:02:46Z"},{"alias_kind":"pith_short_12","alias_value":"NQJTNYMIKV2C","created_at":"2026-05-20T00:02:46Z"},{"alias_kind":"pith_short_16","alias_value":"NQJTNYMIKV2COGVM","created_at":"2026-05-20T00:02:46Z"},{"alias_kind":"pith_short_8","alias_value":"NQJTNYMI","created_at":"2026-05-20T00:02:46Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2024:NQJTNYMIKV2COGVMGH3T2L434C","target":"record","payload":{"canonical_record":{"source":{"id":"2407.16216","kind":"arxiv","version":4},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-07-23T06:45:52Z","cross_cats_sorted":[],"title_canon_sha256":"bfec645c6a4739bb121af3462e0aafbe5a72d5ae773b578fbab24860c6d8d48c","abstract_canon_sha256":"75f29f80c71bf34cb9ade4953861a2bc9667cce7a8189fa8eb6a8cc0e6c3edf2"},"schema_version":"1.0"},"canonical_sha256":"6c1336e1885574271aac31f73d2f9be0be7973e1d05b2fed82916d8eb20ec3db","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T00:02:46.591068Z","signature_b64":"NvAjAEZTPIwEJk74RBzanemryi6cyykdmIrRHuicKUAixz+IgM+sFBzoqhIQYbl41jxrdDoGISvnJHh+3bOJDA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"6c1336e1885574271aac31f73d2f9be0be7973e1d05b2fed82916d8eb20ec3db","last_reissued_at":"2026-05-20T00:02:46.590504Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T00:02:46.590504Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2407.16216","source_version":4,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-20T00:02:46Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"e9Q09zufIJcXDkLq395Kw6oDDYAir6WE5921cVHppW+hntofi0gHypofd/0fDVahAgKiOlgMj4tzVqkqsAh6Cg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-24T20:47:23.505767Z"},"content_sha256":"da82c5b8f6f321a7b970d1ebf5409c21f19f86c843d23cdf02e19381fdd818c3","schema_version":"1.0","event_id":"sha256:da82c5b8f6f321a7b970d1ebf5409c21f19f86c843d23cdf02e19381fdd818c3"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2024:NQJTNYMIKV2COGVMGH3T2L434C","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Reinforcement Learning for LLM Post-Training: A Survey","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Bin Bi, Kiran Ramnath, Na (Claire) Cheng, Shiva Kumar Pentyala, Shubham Mehrotra, Sitaram Asur, Sougata Chaudhuri, Xiang-Bo Mao, Zhichao Wang, Zixu (James) Zhu","submitted_at":"2024-07-23T06:45:52Z","abstract_excerpt":"Large language models (LLMs) trained via pretraining and supervised fine-tuning (SFT) can still produce harmful and misaligned outputs, or struggle in domains like math and coding. Reinforcement learning (RL)-based post-training methods, including Reinforcement Learning from Human Feedback (RLHF) methods like Direct Preference Optimization (DPO) and Reinforcement Learning with Verifiable Rewards (RLVR) approaches like PPO and GRPO, have made remarkable gains to alleviate these issues. Yet, no existing work offers a technically detailed comparison of the various methods driving this progress. I"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2407.16216","kind":"arxiv","version":4},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2407.16216/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-20T00:02:46Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"C7T/L1J83jqER1sBAdgird6AYbnMqujbl3qWaMYirNUKKvl+7+Tv6kL0kwSToCcJHJZGR5qoFjm0oQucdAqzCQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-24T20:47:23.506388Z"},"content_sha256":"b252a03c406eef02ecbe1a950a4931eaf6c1fa016e15918eb2c5ea4ac348485f","schema_version":"1.0","event_id":"sha256:b252a03c406eef02ecbe1a950a4931eaf6c1fa016e15918eb2c5ea4ac348485f"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/NQJTNYMIKV2COGVMGH3T2L434C/bundle.json","state_url":"https://pith.science/pith/NQJTNYMIKV2COGVMGH3T2L434C/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/NQJTNYMIKV2COGVMGH3T2L434C/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-24T20:47:23Z","links":{"resolver":"https://pith.science/pith/NQJTNYMIKV2COGVMGH3T2L434C","bundle":"https://pith.science/pith/NQJTNYMIKV2COGVMGH3T2L434C/bundle.json","state":"https://pith.science/pith/NQJTNYMIKV2COGVMGH3T2L434C/state.json","well_known_bundle":"https://pith.science/.well-known/pith/NQJTNYMIKV2COGVMGH3T2L434C/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2024:NQJTNYMIKV2COGVMGH3T2L434C","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"75f29f80c71bf34cb9ade4953861a2bc9667cce7a8189fa8eb6a8cc0e6c3edf2","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-07-23T06:45:52Z","title_canon_sha256":"bfec645c6a4739bb121af3462e0aafbe5a72d5ae773b578fbab24860c6d8d48c"},"schema_version":"1.0","source":{"id":"2407.16216","kind":"arxiv","version":4}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2407.16216","created_at":"2026-05-20T00:02:46Z"},{"alias_kind":"arxiv_version","alias_value":"2407.16216v4","created_at":"2026-05-20T00:02:46Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2407.16216","created_at":"2026-05-20T00:02:46Z"},{"alias_kind":"pith_short_12","alias_value":"NQJTNYMIKV2C","created_at":"2026-05-20T00:02:46Z"},{"alias_kind":"pith_short_16","alias_value":"NQJTNYMIKV2COGVM","created_at":"2026-05-20T00:02:46Z"},{"alias_kind":"pith_short_8","alias_value":"NQJTNYMI","created_at":"2026-05-20T00:02:46Z"}],"graph_snapshots":[{"event_id":"sha256:b252a03c406eef02ecbe1a950a4931eaf6c1fa016e15918eb2c5ea4ac348485f","target":"graph","created_at":"2026-05-20T00:02:46Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2407.16216/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Large language models (LLMs) trained via pretraining and supervised fine-tuning (SFT) can still produce harmful and misaligned outputs, or struggle in domains like math and coding. Reinforcement learning (RL)-based post-training methods, including Reinforcement Learning from Human Feedback (RLHF) methods like Direct Preference Optimization (DPO) and Reinforcement Learning with Verifiable Rewards (RLVR) approaches like PPO and GRPO, have made remarkable gains to alleviate these issues. Yet, no existing work offers a technically detailed comparison of the various methods driving this progress. I","authors_text":"Bin Bi, Kiran Ramnath, Na (Claire) Cheng, Shiva Kumar Pentyala, Shubham Mehrotra, Sitaram Asur, Sougata Chaudhuri, Xiang-Bo Mao, Zhichao Wang, Zixu (James) Zhu","cross_cats":[],"headline":"","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-07-23T06:45:52Z","title":"Reinforcement Learning for LLM Post-Training: A Survey"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2407.16216","kind":"arxiv","version":4},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:da82c5b8f6f321a7b970d1ebf5409c21f19f86c843d23cdf02e19381fdd818c3","target":"record","created_at":"2026-05-20T00:02:46Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"75f29f80c71bf34cb9ade4953861a2bc9667cce7a8189fa8eb6a8cc0e6c3edf2","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-07-23T06:45:52Z","title_canon_sha256":"bfec645c6a4739bb121af3462e0aafbe5a72d5ae773b578fbab24860c6d8d48c"},"schema_version":"1.0","source":{"id":"2407.16216","kind":"arxiv","version":4}},"canonical_sha256":"6c1336e1885574271aac31f73d2f9be0be7973e1d05b2fed82916d8eb20ec3db","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"6c1336e1885574271aac31f73d2f9be0be7973e1d05b2fed82916d8eb20ec3db","first_computed_at":"2026-05-20T00:02:46.590504Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-20T00:02:46.590504Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"NvAjAEZTPIwEJk74RBzanemryi6cyykdmIrRHuicKUAixz+IgM+sFBzoqhIQYbl41jxrdDoGISvnJHh+3bOJDA==","signature_status":"signed_v1","signed_at":"2026-05-20T00:02:46.591068Z","signed_message":"canonical_sha256_bytes"},"source_id":"2407.16216","source_kind":"arxiv","source_version":4}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:da82c5b8f6f321a7b970d1ebf5409c21f19f86c843d23cdf02e19381fdd818c3","sha256:b252a03c406eef02ecbe1a950a4931eaf6c1fa016e15918eb2c5ea4ac348485f"],"state_sha256":"32c73a6fa0dc70dfa40b62f09c2d4b471d0de57cd7936c8fbd39bd390f94cb6e"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"Z2BiDmG0yRrhDObZyjn6CoCR6AqfEVDqgPcVB9n+gWoJDeQenXzDZ15MdNeP+3W22I1MURAyuIdLToAjMJpRCw==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-24T20:47:23.509642Z","bundle_sha256":"bacbb2567cb77a0d9f909faa1b4cf78bbc14f8adc851ce75d24e9999c76ac28b"}}