{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2022:LTD5JGSW5SGCH6DSSE6TIU5EVG","short_pith_number":"pith:LTD5JGSW","canonical_record":{"source":{"id":"2208.06193","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2022-08-12T09:54:11Z","cross_cats_sorted":["stat.ML"],"title_canon_sha256":"5c66bb80b0bf6538678b8e75cbc03713f00709e3f26ff15317d1935b77f5d513","abstract_canon_sha256":"4b74d562177fd8b0c31e5b6a33eb4e5c14f4960cf8ce5ea86609f570c540a20a"},"schema_version":"1.0"},"canonical_sha256":"5cc7d49a56ec8c23f872913d3453a4a9a17348507309618a3767cd4275dc79e5","source":{"kind":"arxiv","id":"2208.06193","version":3},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2208.06193","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"arxiv_version","alias_value":"2208.06193v3","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2208.06193","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"pith_short_12","alias_value":"LTD5JGSW5SGC","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_16","alias_value":"LTD5JGSW5SGCH6DS","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_8","alias_value":"LTD5JGSW","created_at":"2026-05-18T12:33:33Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2022:LTD5JGSW5SGCH6DSSE6TIU5EVG","target":"record","payload":{"canonical_record":{"source":{"id":"2208.06193","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2022-08-12T09:54:11Z","cross_cats_sorted":["stat.ML"],"title_canon_sha256":"5c66bb80b0bf6538678b8e75cbc03713f00709e3f26ff15317d1935b77f5d513","abstract_canon_sha256":"4b74d562177fd8b0c31e5b6a33eb4e5c14f4960cf8ce5ea86609f570c540a20a"},"schema_version":"1.0"},"canonical_sha256":"5cc7d49a56ec8c23f872913d3453a4a9a17348507309618a3767cd4275dc79e5","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:53.085014Z","signature_b64":"ByYk6hEPMKDWzN7XxblwxTIZznrPpCkTBbluqH+QFntSEPkOqze0YnL/zj3IEh5W3GmV+MfogV0tRm+f9duWBw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"5cc7d49a56ec8c23f872913d3453a4a9a17348507309618a3767cd4275dc79e5","last_reissued_at":"2026-05-17T23:38:53.084373Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:53.084373Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2208.06193","source_version":3,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:53Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"W7iHcLOvyXT0ZdXxn82Whzry6un+70qXdiuNnwVvjPfzDLg79tNg7zwXJh/4+LxqVQQAyCXnWaHC/XePL2deBA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-28T17:54:52.300765Z"},"content_sha256":"22fd182496708fc94916536a389affec4a6cc0d30df485c871cb437ac4b54743","schema_version":"1.0","event_id":"sha256:22fd182496708fc94916536a389affec4a6cc0d30df485c871cb437ac4b54743"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2022:LTD5JGSW5SGCH6DSSE6TIU5EVG","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Diffusion Policies as an Expressive Policy Class for Offline Reinforcement Learning","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"Diffusion models represent policies in a way that lets offline RL reach state-of-the-art on most D4RL tasks.","cross_cats":["stat.ML"],"primary_cat":"cs.LG","authors_text":"Jonathan J Hunt, Mingyuan Zhou, Zhendong Wang","submitted_at":"2022-08-12T09:54:11Z","abstract_excerpt":"Offline reinforcement learning (RL), which aims to learn an optimal policy using a previously collected static dataset, is an important paradigm of RL. Standard RL methods often perform poorly in this regime due to the function approximation errors on out-of-distribution actions. While a variety of regularization methods have been proposed to mitigate this issue, they are often constrained by policy classes with limited expressiveness that can lead to highly suboptimal solutions. In this paper, we propose representing the policy as a diffusion model, a recent class of highly-expressive deep ge"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"We show that our method can achieve state-of-the-art performance on the majority of the D4RL benchmark tasks.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the added action-value term in the diffusion training loss reliably produces policy improvement without destabilizing the generative model or causing mode collapse on real datasets.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Diffusion-QL uses conditional diffusion models as expressive policies in offline RL by coupling behavior cloning with Q-value maximization, achieving SOTA on most D4RL tasks.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Diffusion models represent policies in a way that lets offline RL reach state-of-the-art on most D4RL tasks.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"117381b82a67cc81c9d9ae17f5d399cbaecda637f87af283226394bf0a5cd16c"},"source":{"id":"2208.06193","kind":"arxiv","version":3},"verdict":{"id":"2bb6a9b8-2ea6-46a0-898c-93289a9b5100","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T07:48:52.471767Z","strongest_claim":"We show that our method can achieve state-of-the-art performance on the majority of the D4RL benchmark tasks.","one_line_summary":"Diffusion-QL uses conditional diffusion models as expressive policies in offline RL by coupling behavior cloning with Q-value maximization, achieving SOTA on most D4RL tasks.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the added action-value term in the diffusion training loss reliably produces policy improvement without destabilizing the generative model or causing mode collapse on real datasets.","pith_extraction_headline":"Diffusion models represent policies in a way that lets offline RL reach state-of-the-art on most D4RL tasks."},"references":{"count":29,"sample":[{"doi":"","year":null,"title":"Is Conditional Generative Modeling all you need for Decision-Making?","work_id":"dac365c0-e557-4886-9a1b-179151a66160","ref_index":1,"cited_arxiv_id":"2211.15657","is_internal_anchor":true},{"doi":"","year":2004,"title":"D4RL: Datasets for Deep Data-Driven Reinforcement Learning","work_id":"47082e4e-a4a5-418b-bf4f-4667355065fc","ref_index":2,"cited_arxiv_id":"2004.07219","is_internal_anchor":true},{"doi":"","year":2052,"title":"Off-policy deep reinforcement learning without exploration","work_id":"a9dfa48c-bb2c-42e5-9b2c-49d7cfd660b5","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Know your boundaries: The necessity of explicit behavioral cloning in ofﬂine rl","work_id":"3d58821a-164f-4ec7-8c61-caf3a896a1fb","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Planning with Diffusion for Flexible Behavior Synthesis","work_id":"38b2c635-b754-412a-a8f5-dfcf3e405c95","ref_index":5,"cited_arxiv_id":"2205.09991","is_internal_anchor":true}],"resolved_work":29,"snapshot_sha256":"ab76a0a0e5ee32222574c1a18b3a86bd99e3b0180669f66689a328b948d4f41c","internal_anchors":12},"formal_canon":{"evidence_count":2,"snapshot_sha256":"d266cbe545f0cb01be80a29a9d6fa745e5e4f4f34bdf062aa31bfab394cf029d"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"2bb6a9b8-2ea6-46a0-898c-93289a9b5100"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:53Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"4GwE4hzKaQK+Y49YZn+Ir1c34N6farPfnGNYdONkBD7iQYGv7p0NPxzeafRiQ0Adfha9FOeq+jVFEPIjIrT9CA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-28T17:54:52.302064Z"},"content_sha256":"e945f8e757ac62e1ee67981810ae01be3e7220ac00e57255ec0b347e288760e2","schema_version":"1.0","event_id":"sha256:e945f8e757ac62e1ee67981810ae01be3e7220ac00e57255ec0b347e288760e2"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/LTD5JGSW5SGCH6DSSE6TIU5EVG/bundle.json","state_url":"https://pith.science/pith/LTD5JGSW5SGCH6DSSE6TIU5EVG/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/LTD5JGSW5SGCH6DSSE6TIU5EVG/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-28T17:54:52Z","links":{"resolver":"https://pith.science/pith/LTD5JGSW5SGCH6DSSE6TIU5EVG","bundle":"https://pith.science/pith/LTD5JGSW5SGCH6DSSE6TIU5EVG/bundle.json","state":"https://pith.science/pith/LTD5JGSW5SGCH6DSSE6TIU5EVG/state.json","well_known_bundle":"https://pith.science/.well-known/pith/LTD5JGSW5SGCH6DSSE6TIU5EVG/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2022:LTD5JGSW5SGCH6DSSE6TIU5EVG","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"4b74d562177fd8b0c31e5b6a33eb4e5c14f4960cf8ce5ea86609f570c540a20a","cross_cats_sorted":["stat.ML"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2022-08-12T09:54:11Z","title_canon_sha256":"5c66bb80b0bf6538678b8e75cbc03713f00709e3f26ff15317d1935b77f5d513"},"schema_version":"1.0","source":{"id":"2208.06193","kind":"arxiv","version":3}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2208.06193","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"arxiv_version","alias_value":"2208.06193v3","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2208.06193","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"pith_short_12","alias_value":"LTD5JGSW5SGC","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_16","alias_value":"LTD5JGSW5SGCH6DS","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_8","alias_value":"LTD5JGSW","created_at":"2026-05-18T12:33:33Z"}],"graph_snapshots":[{"event_id":"sha256:e945f8e757ac62e1ee67981810ae01be3e7220ac00e57255ec0b347e288760e2","target":"graph","created_at":"2026-05-17T23:38:53Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"We show that our method can achieve state-of-the-art performance on the majority of the D4RL benchmark tasks."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the added action-value term in the diffusion training loss reliably produces policy improvement without destabilizing the generative model or causing mode collapse on real datasets."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Diffusion-QL uses conditional diffusion models as expressive policies in offline RL by coupling behavior cloning with Q-value maximization, achieving SOTA on most D4RL tasks."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Diffusion models represent policies in a way that lets offline RL reach state-of-the-art on most D4RL tasks."}],"snapshot_sha256":"117381b82a67cc81c9d9ae17f5d399cbaecda637f87af283226394bf0a5cd16c"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"d266cbe545f0cb01be80a29a9d6fa745e5e4f4f34bdf062aa31bfab394cf029d"},"paper":{"abstract_excerpt":"Offline reinforcement learning (RL), which aims to learn an optimal policy using a previously collected static dataset, is an important paradigm of RL. Standard RL methods often perform poorly in this regime due to the function approximation errors on out-of-distribution actions. While a variety of regularization methods have been proposed to mitigate this issue, they are often constrained by policy classes with limited expressiveness that can lead to highly suboptimal solutions. In this paper, we propose representing the policy as a diffusion model, a recent class of highly-expressive deep ge","authors_text":"Jonathan J Hunt, Mingyuan Zhou, Zhendong Wang","cross_cats":["stat.ML"],"headline":"Diffusion models represent policies in a way that lets offline RL reach state-of-the-art on most D4RL tasks.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2022-08-12T09:54:11Z","title":"Diffusion Policies as an Expressive Policy Class for Offline Reinforcement Learning"},"references":{"count":29,"internal_anchors":12,"resolved_work":29,"sample":[{"cited_arxiv_id":"2211.15657","doi":"","is_internal_anchor":true,"ref_index":1,"title":"Is Conditional Generative Modeling all you need for Decision-Making?","work_id":"dac365c0-e557-4886-9a1b-179151a66160","year":null},{"cited_arxiv_id":"2004.07219","doi":"","is_internal_anchor":true,"ref_index":2,"title":"D4RL: Datasets for Deep Data-Driven Reinforcement Learning","work_id":"47082e4e-a4a5-418b-bf4f-4667355065fc","year":2004},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Off-policy deep reinforcement learning without exploration","work_id":"a9dfa48c-bb2c-42e5-9b2c-49d7cfd660b5","year":2052},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Know your boundaries: The necessity of explicit behavioral cloning in ofﬂine rl","work_id":"3d58821a-164f-4ec7-8c61-caf3a896a1fb","year":null},{"cited_arxiv_id":"2205.09991","doi":"","is_internal_anchor":true,"ref_index":5,"title":"Planning with Diffusion for Flexible Behavior Synthesis","work_id":"38b2c635-b754-412a-a8f5-dfcf3e405c95","year":null}],"snapshot_sha256":"ab76a0a0e5ee32222574c1a18b3a86bd99e3b0180669f66689a328b948d4f41c"},"source":{"id":"2208.06193","kind":"arxiv","version":3},"verdict":{"created_at":"2026-05-15T07:48:52.471767Z","id":"2bb6a9b8-2ea6-46a0-898c-93289a9b5100","model_set":{"reader":"grok-4.3"},"one_line_summary":"Diffusion-QL uses conditional diffusion models as expressive policies in offline RL by coupling behavior cloning with Q-value maximization, achieving SOTA on most D4RL tasks.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Diffusion models represent policies in a way that lets offline RL reach state-of-the-art on most D4RL tasks.","strongest_claim":"We show that our method can achieve state-of-the-art performance on the majority of the D4RL benchmark tasks.","weakest_assumption":"That the added action-value term in the diffusion training loss reliably produces policy improvement without destabilizing the generative model or causing mode collapse on real datasets."}},"verdict_id":"2bb6a9b8-2ea6-46a0-898c-93289a9b5100"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:22fd182496708fc94916536a389affec4a6cc0d30df485c871cb437ac4b54743","target":"record","created_at":"2026-05-17T23:38:53Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"4b74d562177fd8b0c31e5b6a33eb4e5c14f4960cf8ce5ea86609f570c540a20a","cross_cats_sorted":["stat.ML"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2022-08-12T09:54:11Z","title_canon_sha256":"5c66bb80b0bf6538678b8e75cbc03713f00709e3f26ff15317d1935b77f5d513"},"schema_version":"1.0","source":{"id":"2208.06193","kind":"arxiv","version":3}},"canonical_sha256":"5cc7d49a56ec8c23f872913d3453a4a9a17348507309618a3767cd4275dc79e5","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"5cc7d49a56ec8c23f872913d3453a4a9a17348507309618a3767cd4275dc79e5","first_computed_at":"2026-05-17T23:38:53.084373Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:53.084373Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"ByYk6hEPMKDWzN7XxblwxTIZznrPpCkTBbluqH+QFntSEPkOqze0YnL/zj3IEh5W3GmV+MfogV0tRm+f9duWBw==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:53.085014Z","signed_message":"canonical_sha256_bytes"},"source_id":"2208.06193","source_kind":"arxiv","source_version":3}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:22fd182496708fc94916536a389affec4a6cc0d30df485c871cb437ac4b54743","sha256:e945f8e757ac62e1ee67981810ae01be3e7220ac00e57255ec0b347e288760e2"],"state_sha256":"6901e44ec85152f2ae404a7fe87c66b8acc287a75668aa46f2b36816afca5f64"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"He5ajpm5OSA5v+cSXWPKaKALQKR0nGOxzcUol8PWr13zX9ilX+Oy1sJ2ku6aCwGRW5TGGHBqTNAvJA0pCUv1Bg==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-28T17:54:52.306824Z","bundle_sha256":"e9ee69cf1e8f2663ed5d900407c581ee534c104eaf1d995dfd12c39a6c33231e"}}