{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2023:FPDZ72QBZQE26KWERZH6FTUZ2N","short_pith_number":"pith:FPDZ72QB","schema_version":"1.0","canonical_sha256":"2bc79fea01cc09af2ac48e4fe2ce99d35659314a765873ddd405927a2aa38e2e","source":{"kind":"arxiv","id":"2309.17400","version":2},"attestation_state":"computed","paper":{"title":"Directly Fine-Tuning Diffusion Models on Differentiable Rewards","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"Diffusion models can be fine-tuned directly on differentiable rewards by backpropagating gradients through the full sampling process.","cross_cats":["cs.LG"],"primary_cat":"cs.CV","authors_text":"David J Fleet, Kevin Clark, Kevin Swersky, Paul Vicol","submitted_at":"2023-09-29T17:01:02Z","abstract_excerpt":"We present Direct Reward Fine-Tuning (DRaFT), a simple and effective method for fine-tuning diffusion models to maximize differentiable reward functions, such as scores from human preference models. We first show that it is possible to backpropagate the reward function gradient through the full sampling procedure, and that doing so achieves strong performance on a variety of rewards, outperforming reinforcement learning-based approaches. We then propose more efficient variants of DRaFT: DRaFT-K, which truncates backpropagation to only the last K steps of sampling, and DRaFT-LV, which obtains l"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":true,"formal_links_present":false},"canonical_record":{"source":{"id":"2309.17400","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2023-09-29T17:01:02Z","cross_cats_sorted":["cs.LG"],"title_canon_sha256":"2991c385ae42b382a118f4a82555f5882ffacc8d8a6358214db6212e6f3ac194","abstract_canon_sha256":"587acc25088d331ade6829017f847a4b43d8971b06be5caf41cd8c2f78dc6469"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:48.404608Z","signature_b64":"0AYdwb2Ns+cDQxMgNPSD+P/UaAgqC9OsQHSaDxwjAlb/a513Qv/ubybkqhvL9UO72wx7LRbrOZMYAjwJgZhVAQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"2bc79fea01cc09af2ac48e4fe2ce99d35659314a765873ddd405927a2aa38e2e","last_reissued_at":"2026-05-17T23:38:48.404088Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:48.404088Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Directly Fine-Tuning Diffusion Models on Differentiable Rewards","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"Diffusion models can be fine-tuned directly on differentiable rewards by backpropagating gradients through the full sampling process.","cross_cats":["cs.LG"],"primary_cat":"cs.CV","authors_text":"David J Fleet, Kevin Clark, Kevin Swersky, Paul Vicol","submitted_at":"2023-09-29T17:01:02Z","abstract_excerpt":"We present Direct Reward Fine-Tuning (DRaFT), a simple and effective method for fine-tuning diffusion models to maximize differentiable reward functions, such as scores from human preference models. We first show that it is possible to backpropagate the reward function gradient through the full sampling procedure, and that doing so achieves strong performance on a variety of rewards, outperforming reinforcement learning-based approaches. We then propose more efficient variants of DRaFT: DRaFT-K, which truncates backpropagation to only the last K steps of sampling, and DRaFT-LV, which obtains l"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"it is possible to backpropagate the reward function gradient through the full sampling procedure, and that doing so achieves strong performance on a variety of rewards, outperforming reinforcement learning-based approaches.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The reward function must be differentiable with respect to the generated samples, and the sampling process must allow stable gradient flow without excessive variance or memory issues.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"DRaFT fine-tunes diffusion models by differentiating through sampling to maximize rewards, outperforming RL baselines and improving aesthetics on Stable Diffusion 1.4.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Diffusion models can be fine-tuned directly on differentiable rewards by backpropagating gradients through the full sampling process.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"76d1f39cd3f96955bacd3e92e14d5aa91b21053a72263177b9254b134498e511"},"source":{"id":"2309.17400","kind":"arxiv","version":2},"verdict":{"id":"3c502d07-8b04-4296-893c-d3ddb0fb3834","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-16T09:07:34.450537Z","strongest_claim":"it is possible to backpropagate the reward function gradient through the full sampling procedure, and that doing so achieves strong performance on a variety of rewards, outperforming reinforcement learning-based approaches.","one_line_summary":"DRaFT fine-tunes diffusion models by differentiating through sampling to maximize rewards, outperforming RL baselines and improving aesthetics on Stable Diffusion 1.4.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The reward function must be differentiable with respect to the generated samples, and the sampling process must allow stable gradient flow without excessive variance or memory issues.","pith_extraction_headline":"Diffusion models can be fine-tuned directly on differentiable rewards by backpropagating gradients through the full sampling process."},"references":{"count":38,"sample":[{"doi":"","year":null,"title":"A General Language Assistant as a Laboratory for Alignment","work_id":"a43f9ea0-01be-47d5-b8ee-a1a9f73381c5","ref_index":1,"cited_arxiv_id":"2112.00861","is_internal_anchor":true},{"doi":"","year":null,"title":"Training a Helpful and Harmless Assistant with Reinforcement Learning from Human Feedback","work_id":"a1f2574b-a899-4713-be60-c87ba332656c","ref_index":2,"cited_arxiv_id":"2204.05862","is_internal_anchor":true},{"doi":"","year":null,"title":"Training Diffusion Models with Reinforcement Learning","work_id":"67684dda-3930-452a-b91a-36cbb8e2e219","ref_index":3,"cited_arxiv_id":"2305.13301","is_internal_anchor":true},{"doi":"","year":null,"title":"Training Deep Nets with Sublinear Memory Cost","work_id":"f2c5c287-a500-40e4-a136-e7e3172db1d7","ref_index":4,"cited_arxiv_id":"1604.06174","is_internal_anchor":true},{"doi":"","year":null,"title":"Microsoft COCO Captions: Data Collection and Evaluation Server","work_id":"b3d6fb46-4169-4a28-8f7e-2ca6774211da","ref_index":5,"cited_arxiv_id":"1504.00325","is_internal_anchor":true}],"resolved_work":38,"snapshot_sha256":"d5b7ffae4c3f191538de78d970ba5218df394fe688af195c965ef5a416ecc066","internal_anchors":13},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2309.17400","created_at":"2026-05-17T23:38:48.404175+00:00"},{"alias_kind":"arxiv_version","alias_value":"2309.17400v2","created_at":"2026-05-17T23:38:48.404175+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2309.17400","created_at":"2026-05-17T23:38:48.404175+00:00"},{"alias_kind":"pith_short_12","alias_value":"FPDZ72QBZQE2","created_at":"2026-05-18T12:33:33.725879+00:00"},{"alias_kind":"pith_short_16","alias_value":"FPDZ72QBZQE26KWE","created_at":"2026-05-18T12:33:33.725879+00:00"},{"alias_kind":"pith_short_8","alias_value":"FPDZ72QB","created_at":"2026-05-18T12:33:33.725879+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":29,"internal_anchor_count":29,"sample":[{"citing_arxiv_id":"2412.15689","citing_title":"DOLLAR: Few-Step Video Generation via Distillation and Latent Reward Optimization","ref_index":8,"is_internal_anchor":true},{"citing_arxiv_id":"2602.10933","citing_title":"CMAD: Cooperative Multi-Agent Diffusion via Stochastic Optimal Control","ref_index":3,"is_internal_anchor":true},{"citing_arxiv_id":"2604.27147","citing_title":"How to Guide Your Flow: Few-Step Alignment via Flow Map Reward Guidance","ref_index":7,"is_internal_anchor":true},{"citing_arxiv_id":"2605.15803","citing_title":"Embedding-perturbed Exploration Preference Optimization for Flow Models","ref_index":13,"is_internal_anchor":true},{"citing_arxiv_id":"2605.19839","citing_title":"When Preference Labels Fall Short: Aligning Diffusion Models from Real Data","ref_index":3,"is_internal_anchor":true},{"citing_arxiv_id":"2507.08390","citing_title":"Inference-Time Scaling of Diffusion Language Models via Trajectory Refinement","ref_index":7,"is_internal_anchor":true},{"citing_arxiv_id":"2510.06637","citing_title":"Control-Augmented Autoregressive Diffusion for Data Assimilation","ref_index":2,"is_internal_anchor":true},{"citing_arxiv_id":"2512.01236","citing_title":"PSR: Scaling Multi-Subject Personalized Image Generation with Pairwise Subject-Consistency Rewards","ref_index":3,"is_internal_anchor":true},{"citing_arxiv_id":"2412.21059","citing_title":"VisionReward: Fine-Grained Multi-Dimensional Human Preference Learning for Image and Video Generation","ref_index":27,"is_internal_anchor":true},{"citing_arxiv_id":"2409.00588","citing_title":"Diffusion Policy Policy Optimization","ref_index":21,"is_internal_anchor":true},{"citing_arxiv_id":"2602.07069","citing_title":"Bird-SR: Bidirectional Reward-Guided Diffusion for Real-World Image Super-Resolution","ref_index":14,"is_internal_anchor":true},{"citing_arxiv_id":"2603.00918","citing_title":"Improving Text-to-Image Generation with Intrinsic Self-Confidence Rewards","ref_index":13,"is_internal_anchor":true},{"citing_arxiv_id":"2605.12650","citing_title":"CRAFT: Clinical Reward-Aligned Finetuning for Medical Image Synthesis","ref_index":7,"is_internal_anchor":true},{"citing_arxiv_id":"2501.13918","citing_title":"Improving Video Generation with Human Feedback","ref_index":9,"is_internal_anchor":true},{"citing_arxiv_id":"2604.06779","citing_title":"VASR: Variance-Aware Systematic Resampling for Reward-Guided Diffusion","ref_index":13,"is_internal_anchor":true},{"citing_arxiv_id":"2604.27147","citing_title":"How to Guide Your Flow: Few-Step Alignment via Flow Map Reward Guidance","ref_index":7,"is_internal_anchor":true},{"citing_arxiv_id":"2605.10937","citing_title":"Power Reinforcement Post-Training of Text-to-Image Models with Super-Linear Advantage Shaping","ref_index":78,"is_internal_anchor":true},{"citing_arxiv_id":"2605.09433","citing_title":"Offline Preference Optimization for Rectified Flow with Noise-Tracked Pairs","ref_index":4,"is_internal_anchor":true},{"citing_arxiv_id":"2605.06583","citing_title":"Improved techniques for fine-tuning flow models via adjoint matching: a deterministic control pipeline","ref_index":6,"is_internal_anchor":true},{"citing_arxiv_id":"2505.05470","citing_title":"Flow-GRPO: Training Flow Matching Models via Online RL","ref_index":31,"is_internal_anchor":true},{"citing_arxiv_id":"2605.04494","citing_title":"Towards General Preference Alignment: Diffusion Models at Nash Equilibrium","ref_index":5,"is_internal_anchor":true},{"citing_arxiv_id":"2604.18258","citing_title":"Long-Text-to-Image Generation via Compositional Prompt Decomposition","ref_index":57,"is_internal_anchor":true},{"citing_arxiv_id":"2604.13305","citing_title":"Bias at the End of the Score","ref_index":11,"is_internal_anchor":true},{"citing_arxiv_id":"2604.06779","citing_title":"VASR: Variance-Aware Systematic Resampling for Reward-Guided Diffusion","ref_index":13,"is_internal_anchor":true},{"citing_arxiv_id":"2604.06916","citing_title":"FP4 Explore, BF16 Train: Diffusion Reinforcement Learning via Efficient Rollout Scaling","ref_index":29,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/FPDZ72QBZQE26KWERZH6FTUZ2N","json":"https://pith.science/pith/FPDZ72QBZQE26KWERZH6FTUZ2N.json","graph_json":"https://pith.science/api/pith-number/FPDZ72QBZQE26KWERZH6FTUZ2N/graph.json","events_json":"https://pith.science/api/pith-number/FPDZ72QBZQE26KWERZH6FTUZ2N/events.json","paper":"https://pith.science/paper/FPDZ72QB"},"agent_actions":{"view_html":"https://pith.science/pith/FPDZ72QBZQE26KWERZH6FTUZ2N","download_json":"https://pith.science/pith/FPDZ72QBZQE26KWERZH6FTUZ2N.json","view_paper":"https://pith.science/paper/FPDZ72QB","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2309.17400&json=true","fetch_graph":"https://pith.science/api/pith-number/FPDZ72QBZQE26KWERZH6FTUZ2N/graph.json","fetch_events":"https://pith.science/api/pith-number/FPDZ72QBZQE26KWERZH6FTUZ2N/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/FPDZ72QBZQE26KWERZH6FTUZ2N/action/timestamp_anchor","attest_storage":"https://pith.science/pith/FPDZ72QBZQE26KWERZH6FTUZ2N/action/storage_attestation","attest_author":"https://pith.science/pith/FPDZ72QBZQE26KWERZH6FTUZ2N/action/author_attestation","sign_citation":"https://pith.science/pith/FPDZ72QBZQE26KWERZH6FTUZ2N/action/citation_signature","submit_replication":"https://pith.science/pith/FPDZ72QBZQE26KWERZH6FTUZ2N/action/replication_record"}},"created_at":"2026-05-17T23:38:48.404175+00:00","updated_at":"2026-05-17T23:38:48.404175+00:00"}