{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2018:GPLEA2RIASRI75YOBHQ7FK27Q7","short_pith_number":"pith:GPLEA2RI","schema_version":"1.0","canonical_sha256":"33d6406a2804a28ff70e09e1f2ab5f87f84985d68e3a92fe8aa55aa7030f5c26","source":{"kind":"arxiv","id":"1811.06272","version":1},"attestation_state":"computed","paper":{"title":"Woulda, Coulda, Shoulda: Counterfactually-Guided Policy Search","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["stat.ML"],"primary_cat":"cs.LG","authors_text":"Arthur Guez, Jean-Baptiste Lespiau, Lars Buesing, Nicolas Heess, Sebastien Racaniere, Theophane Weber, Yori Zwols","submitted_at":"2018-11-15T10:08:58Z","abstract_excerpt":"Learning policies on data synthesized by models can in principle quench the thirst of reinforcement learning algorithms for large amounts of real experience, which is often costly to acquire. However, simulating plausible experience de novo is a hard problem for many complex environments, often resulting in biases for model-based policy evaluation and search. Instead of de novo synthesis of data, here we assume logged, real experience and model alternative outcomes of this experience under counterfactual actions, actions that were not actually taken. Based on this, we propose the Counterfactua"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1811.06272","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2018-11-15T10:08:58Z","cross_cats_sorted":["stat.ML"],"title_canon_sha256":"d1dbca7481045412161e03ccdf4966341d6c673b4dcce6b894f99f3752e99ada","abstract_canon_sha256":"1255f28e582843d007311ce221681999cba5bc4cd165a9b3daa25bd2afd9da1f"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T00:00:38.153906Z","signature_b64":"fr7GnKcHadsDcbp+ZPirlpiqFCM4fV2ODQOl2E64r6LDJj4K8HvS/h8MtxfERlzui8o5Qbmh/6B6tW4V1rFcAw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"33d6406a2804a28ff70e09e1f2ab5f87f84985d68e3a92fe8aa55aa7030f5c26","last_reissued_at":"2026-05-18T00:00:38.153390Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T00:00:38.153390Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Woulda, Coulda, Shoulda: Counterfactually-Guided Policy Search","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["stat.ML"],"primary_cat":"cs.LG","authors_text":"Arthur Guez, Jean-Baptiste Lespiau, Lars Buesing, Nicolas Heess, Sebastien Racaniere, Theophane Weber, Yori Zwols","submitted_at":"2018-11-15T10:08:58Z","abstract_excerpt":"Learning policies on data synthesized by models can in principle quench the thirst of reinforcement learning algorithms for large amounts of real experience, which is often costly to acquire. However, simulating plausible experience de novo is a hard problem for many complex environments, often resulting in biases for model-based policy evaluation and search. Instead of de novo synthesis of data, here we assume logged, real experience and model alternative outcomes of this experience under counterfactual actions, actions that were not actually taken. Based on this, we propose the Counterfactua"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1811.06272","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1811.06272","created_at":"2026-05-18T00:00:38.153462+00:00"},{"alias_kind":"arxiv_version","alias_value":"1811.06272v1","created_at":"2026-05-18T00:00:38.153462+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1811.06272","created_at":"2026-05-18T00:00:38.153462+00:00"},{"alias_kind":"pith_short_12","alias_value":"GPLEA2RIASRI","created_at":"2026-05-18T12:32:25.280505+00:00"},{"alias_kind":"pith_short_16","alias_value":"GPLEA2RIASRI75YO","created_at":"2026-05-18T12:32:25.280505+00:00"},{"alias_kind":"pith_short_8","alias_value":"GPLEA2RI","created_at":"2026-05-18T12:32:25.280505+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":3,"internal_anchor_count":2,"sample":[{"citing_arxiv_id":"2502.13731","citing_title":"Robust Counterfactual Inference in Markov Decision Processes","ref_index":3,"is_internal_anchor":true},{"citing_arxiv_id":"2411.05174","citing_title":"Bayesian Inverse Transition Learning: Learning Dynamics From Near-Optimal Trajectories","ref_index":7,"is_internal_anchor":true},{"citing_arxiv_id":"2605.06066","citing_title":"Causal Reinforcement Learning for Complex Card Games: A Magic The Gathering Benchmark","ref_index":45,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/GPLEA2RIASRI75YOBHQ7FK27Q7","json":"https://pith.science/pith/GPLEA2RIASRI75YOBHQ7FK27Q7.json","graph_json":"https://pith.science/api/pith-number/GPLEA2RIASRI75YOBHQ7FK27Q7/graph.json","events_json":"https://pith.science/api/pith-number/GPLEA2RIASRI75YOBHQ7FK27Q7/events.json","paper":"https://pith.science/paper/GPLEA2RI"},"agent_actions":{"view_html":"https://pith.science/pith/GPLEA2RIASRI75YOBHQ7FK27Q7","download_json":"https://pith.science/pith/GPLEA2RIASRI75YOBHQ7FK27Q7.json","view_paper":"https://pith.science/paper/GPLEA2RI","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1811.06272&json=true","fetch_graph":"https://pith.science/api/pith-number/GPLEA2RIASRI75YOBHQ7FK27Q7/graph.json","fetch_events":"https://pith.science/api/pith-number/GPLEA2RIASRI75YOBHQ7FK27Q7/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/GPLEA2RIASRI75YOBHQ7FK27Q7/action/timestamp_anchor","attest_storage":"https://pith.science/pith/GPLEA2RIASRI75YOBHQ7FK27Q7/action/storage_attestation","attest_author":"https://pith.science/pith/GPLEA2RIASRI75YOBHQ7FK27Q7/action/author_attestation","sign_citation":"https://pith.science/pith/GPLEA2RIASRI75YOBHQ7FK27Q7/action/citation_signature","submit_replication":"https://pith.science/pith/GPLEA2RIASRI75YOBHQ7FK27Q7/action/replication_record"}},"created_at":"2026-05-18T00:00:38.153462+00:00","updated_at":"2026-05-18T00:00:38.153462+00:00"}