{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2018:ADY4X6ARGB7T42NAT5LO32OAUB","short_pith_number":"pith:ADY4X6AR","schema_version":"1.0","canonical_sha256":"00f1cbf811307f3e69a09f56ede9c0a04d020ccef314e837cc1aaf737f35cfb5","source":{"kind":"arxiv","id":"1812.02648","version":1},"attestation_state":"computed","paper":{"title":"Deep Reinforcement Learning and the Deadly Triad","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.LG"],"primary_cat":"cs.AI","authors_text":"Florian Strub, Hado van Hasselt, Joseph Modayil, Matteo Hessel, Nicolas Sonnerat, Yotam Doron","submitted_at":"2018-12-06T16:36:20Z","abstract_excerpt":"We know from reinforcement learning theory that temporal difference learning can fail in certain cases. Sutton and Barto (2018) identify a deadly triad of function approximation, bootstrapping, and off-policy learning. When these three properties are combined, learning can diverge with the value estimates becoming unbounded. However, several algorithms successfully combine these three properties, which indicates that there is at least a partial gap in our understanding. In this work, we investigate the impact of the deadly triad in practice, in the context of a family of popular deep reinforce"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1812.02648","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2018-12-06T16:36:20Z","cross_cats_sorted":["cs.LG"],"title_canon_sha256":"aa47f17602f9cbdae55044b4a5ca8d5a0c631d2b03fdfbacf5e29b73391d078e","abstract_canon_sha256":"0580d25c5a5eb04c82d2db5958a7e1206197b628130588ad2d329d0cb2d39963"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:58:55.256462Z","signature_b64":"DwPr1SKbvvHxBURz6TMfImQMujGdwIEg3cnuZWC3b5wEbZB/Trww3AEEkzLPxEJ2BTV1L/ANa597VlGE7tLRBQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"00f1cbf811307f3e69a09f56ede9c0a04d020ccef314e837cc1aaf737f35cfb5","last_reissued_at":"2026-05-17T23:58:55.256070Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:58:55.256070Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Deep Reinforcement Learning and the Deadly Triad","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.LG"],"primary_cat":"cs.AI","authors_text":"Florian Strub, Hado van Hasselt, Joseph Modayil, Matteo Hessel, Nicolas Sonnerat, Yotam Doron","submitted_at":"2018-12-06T16:36:20Z","abstract_excerpt":"We know from reinforcement learning theory that temporal difference learning can fail in certain cases. Sutton and Barto (2018) identify a deadly triad of function approximation, bootstrapping, and off-policy learning. When these three properties are combined, learning can diverge with the value estimates becoming unbounded. However, several algorithms successfully combine these three properties, which indicates that there is at least a partial gap in our understanding. In this work, we investigate the impact of the deadly triad in practice, in the context of a family of popular deep reinforce"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1812.02648","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1812.02648","created_at":"2026-05-17T23:58:55.256130+00:00"},{"alias_kind":"arxiv_version","alias_value":"1812.02648v1","created_at":"2026-05-17T23:58:55.256130+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1812.02648","created_at":"2026-05-17T23:58:55.256130+00:00"},{"alias_kind":"pith_short_12","alias_value":"ADY4X6ARGB7T","created_at":"2026-05-18T12:32:13.499390+00:00"},{"alias_kind":"pith_short_16","alias_value":"ADY4X6ARGB7T42NA","created_at":"2026-05-18T12:32:13.499390+00:00"},{"alias_kind":"pith_short_8","alias_value":"ADY4X6AR","created_at":"2026-05-18T12:32:13.499390+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":17,"internal_anchor_count":7,"sample":[{"citing_arxiv_id":"2411.04832","citing_title":"Plasticity Loss in Deep Reinforcement Learning: A Survey","ref_index":105,"is_internal_anchor":true},{"citing_arxiv_id":"2605.21214","citing_title":"Behavior-Consistent Deep Reinforcement Learning","ref_index":99,"is_internal_anchor":true},{"citing_arxiv_id":"2507.00275","citing_title":"Deep Double Q-learning","ref_index":38,"is_internal_anchor":true},{"citing_arxiv_id":"2510.02590","citing_title":"Use the Online Network If You Can: Towards Fast and Stable Reinforcement Learning","ref_index":10,"is_internal_anchor":true},{"citing_arxiv_id":"2605.21214","citing_title":"Behavior-Consistent Deep Reinforcement Learning","ref_index":99,"is_internal_anchor":true},{"citing_arxiv_id":"2604.04539","citing_title":"FlashSAC: Fast and Stable Off-Policy Reinforcement Learning for High-Dimensional Robot Control","ref_index":87,"is_internal_anchor":true},{"citing_arxiv_id":"2509.08660","citing_title":"Replicable Reinforcement Learning with Linear Function Approximation","ref_index":6,"is_internal_anchor":true},{"citing_arxiv_id":"1911.11361","citing_title":"Behavior Regularized Offline Reinforcement Learning","ref_index":18,"is_internal_anchor":false},{"citing_arxiv_id":"2605.12004","citing_title":"Learning Agentic Policy from Action Guidance","ref_index":58,"is_internal_anchor":false},{"citing_arxiv_id":"2605.05812","citing_title":"Long-Horizon Q-Learning: Accurate Value Learning via n-Step Inequalities","ref_index":45,"is_internal_anchor":false},{"citing_arxiv_id":"2605.06373","citing_title":"Beyond the Independence Assumption: Finite-Sample Guarantees for Deep Q-Learning under $\\tau$-Mixing","ref_index":32,"is_internal_anchor":false},{"citing_arxiv_id":"2605.05812","citing_title":"Long-Horizon Q-Learning: Accurate Value Learning via n-Step Inequalities","ref_index":45,"is_internal_anchor":false},{"citing_arxiv_id":"2604.23056","citing_title":"K-Score: Kalman Filter as a Principled Alternative to Reward Normalization in Reinforcement Learning","ref_index":5,"is_internal_anchor":false},{"citing_arxiv_id":"2005.01643","citing_title":"Offline Reinforcement Learning: Tutorial, Review, and Perspectives on Open Problems","ref_index":55,"is_internal_anchor":false},{"citing_arxiv_id":"2605.01968","citing_title":"AdamO: A Collapse-Suppressed Optimizer for Offline RL","ref_index":34,"is_internal_anchor":false},{"citing_arxiv_id":"2605.01862","citing_title":"QHyer: Q-conditioned Hybrid Attention-mamba Transformer for Offline Goal-conditioned RL","ref_index":199,"is_internal_anchor":false},{"citing_arxiv_id":"2604.04539","citing_title":"FlashSAC: Fast and Stable Off-Policy Reinforcement Learning for High-Dimensional Robot Control","ref_index":87,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/ADY4X6ARGB7T42NAT5LO32OAUB","json":"https://pith.science/pith/ADY4X6ARGB7T42NAT5LO32OAUB.json","graph_json":"https://pith.science/api/pith-number/ADY4X6ARGB7T42NAT5LO32OAUB/graph.json","events_json":"https://pith.science/api/pith-number/ADY4X6ARGB7T42NAT5LO32OAUB/events.json","paper":"https://pith.science/paper/ADY4X6AR"},"agent_actions":{"view_html":"https://pith.science/pith/ADY4X6ARGB7T42NAT5LO32OAUB","download_json":"https://pith.science/pith/ADY4X6ARGB7T42NAT5LO32OAUB.json","view_paper":"https://pith.science/paper/ADY4X6AR","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1812.02648&json=true","fetch_graph":"https://pith.science/api/pith-number/ADY4X6ARGB7T42NAT5LO32OAUB/graph.json","fetch_events":"https://pith.science/api/pith-number/ADY4X6ARGB7T42NAT5LO32OAUB/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/ADY4X6ARGB7T42NAT5LO32OAUB/action/timestamp_anchor","attest_storage":"https://pith.science/pith/ADY4X6ARGB7T42NAT5LO32OAUB/action/storage_attestation","attest_author":"https://pith.science/pith/ADY4X6ARGB7T42NAT5LO32OAUB/action/author_attestation","sign_citation":"https://pith.science/pith/ADY4X6ARGB7T42NAT5LO32OAUB/action/citation_signature","submit_replication":"https://pith.science/pith/ADY4X6ARGB7T42NAT5LO32OAUB/action/replication_record"}},"created_at":"2026-05-17T23:58:55.256130+00:00","updated_at":"2026-05-17T23:58:55.256130+00:00"}