{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:MUNRGAUKXDN3AS6PILEHIIXO5Y","short_pith_number":"pith:MUNRGAUK","canonical_record":{"source":{"id":"2603.22273","kind":"arxiv","version":4},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-03-23T17:56:52Z","cross_cats_sorted":[],"title_canon_sha256":"0bbc81c192dee09c25af7997247b6ad8a825e9fbb458ac5d2b94559859087437","abstract_canon_sha256":"75c8a6e40ae4c65700b5c11715334b9e7e52624b7f1857693e421012fc5ce241"},"schema_version":"1.0"},"canonical_sha256":"651b13028ab8dbb04bcf42c87422eeee30bace98d09b6ca7a3b140b601a51de6","source":{"kind":"arxiv","id":"2603.22273","version":4},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2603.22273","created_at":"2026-05-18T02:44:30Z"},{"alias_kind":"arxiv_version","alias_value":"2603.22273v4","created_at":"2026-05-18T02:44:30Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2603.22273","created_at":"2026-05-18T02:44:30Z"},{"alias_kind":"pith_short_12","alias_value":"MUNRGAUKXDN3","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"MUNRGAUKXDN3AS6P","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"MUNRGAUK","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:MUNRGAUKXDN3AS6PILEHIIXO5Y","target":"record","payload":{"canonical_record":{"source":{"id":"2603.22273","kind":"arxiv","version":4},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-03-23T17:56:52Z","cross_cats_sorted":[],"title_canon_sha256":"0bbc81c192dee09c25af7997247b6ad8a825e9fbb458ac5d2b94559859087437","abstract_canon_sha256":"75c8a6e40ae4c65700b5c11715334b9e7e52624b7f1857693e421012fc5ce241"},"schema_version":"1.0"},"canonical_sha256":"651b13028ab8dbb04bcf42c87422eeee30bace98d09b6ca7a3b140b601a51de6","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T02:44:30.762111Z","signature_b64":"MhViqjvAMTRh+JBxYZKjFrtdxtcSn/ev6OIOIekfVcR83iRTOHDLoZLD7fezgRFzYn7xwOj45Wk+UWlHhi0EBw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"651b13028ab8dbb04bcf42c87422eeee30bace98d09b6ca7a3b140b601a51de6","last_reissued_at":"2026-05-18T02:44:30.761130Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T02:44:30.761130Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2603.22273","source_version":4,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T02:44:30Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"ZWziB3m5DUMGlsMIiF+yOFEgxKpJp9J8xejRGZNLUqHW1SOQi+Izl0DpAjjwStlZAbJ2y5x1550/xPbf2AwYCQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-31T21:51:24.577979Z"},"content_sha256":"44d8de9d47c72bfb39240bf9ad93c23a2832a2ec39ba858de7a6814d9c0c54dc","schema_version":"1.0","event_id":"sha256:44d8de9d47c72bfb39240bf9ad93c23a2832a2ec39ba858de7a6814d9c0c54dc"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:MUNRGAUKXDN3AS6PILEHIIXO5Y","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Decoupling Exploration and Policy Optimization: Uncertainty Guided Tree Search for Hard Exploration","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Uncertainty-guided tree search decouples exploration from policy optimization to reach SOTA on hard RL benchmarks.","cross_cats":[],"primary_cat":"cs.LG","authors_text":"James Cohan, Zakaria Mhammedi","submitted_at":"2026-03-23T17:56:52Z","abstract_excerpt":"The process of discovery requires active exploration -- the act of collecting new and informative data. However, efficient autonomous exploration remains a major unsolved problem. The dominant paradigm addresses this challenge by using Reinforcement Learning (RL) to train agents with intrinsic motivation, maximizing a composite objective of extrinsic and intrinsic rewards. We suggest that this approach incurs unnecessary overhead: while policy optimization is necessary for precise task execution, employing such machinery solely to expand state coverage may be inefficient. In this paper, we pro"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"By removing the overhead of policy optimization, our approach explores an order of magnitude more efficiently than standard intrinsic motivation baselines on hard exploration benchmarks. ... achieving state-of-the-art performance by a wide margin on Montezuma's Revenge, Pitfall!, and Venture without relying on domain-specific knowledge. ... solving the MuJoCo Adroit dexterous manipulation and AntMaze tasks in a sparse-reward setting, directly from image observations and without expert demonstrations or offline datasets.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That an uncertainty measure paired with Go-With-The-Winner-style tree search will systematically expand state coverage in hard exploration domains without the policy optimization step, and that the resulting trajectories can be reliably distilled into high-performing policies using existing supervised backward learning algorithms.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Uncertainty-guided tree search decouples exploration from RL policy optimization, achieving order-of-magnitude better efficiency and SOTA performance on sparse-reward tasks like Montezuma's Revenge, Pitfall, and Venture via trajectory distillation.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Uncertainty-guided tree search decouples exploration from policy optimization to reach SOTA on hard RL benchmarks.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"ffd20b98b42349f21d1bdbe122ffb09ffa9b2ec1f9508bc22410a3c2cacd0ada"},"source":{"id":"2603.22273","kind":"arxiv","version":4},"verdict":{"id":"30a17c29-74a6-4588-abc3-4c13a76975d2","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T00:17:00.319794Z","strongest_claim":"By removing the overhead of policy optimization, our approach explores an order of magnitude more efficiently than standard intrinsic motivation baselines on hard exploration benchmarks. ... achieving state-of-the-art performance by a wide margin on Montezuma's Revenge, Pitfall!, and Venture without relying on domain-specific knowledge. ... solving the MuJoCo Adroit dexterous manipulation and AntMaze tasks in a sparse-reward setting, directly from image observations and without expert demonstrations or offline datasets.","one_line_summary":"Uncertainty-guided tree search decouples exploration from RL policy optimization, achieving order-of-magnitude better efficiency and SOTA performance on sparse-reward tasks like Montezuma's Revenge, Pitfall, and Venture via trajectory distillation.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That an uncertainty measure paired with Go-With-The-Winner-style tree search will systematically expand state coverage in hard exploration domains without the policy optimization step, and that the resulting trajectories can be reliably distilled into high-performing policies using existing supervised backward learning algorithms.","pith_extraction_headline":"Uncertainty-guided tree search decouples exploration from policy optimization to reach SOTA on hard RL benchmarks."},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"30a17c29-74a6-4588-abc3-4c13a76975d2"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T02:44:30Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"/WevMPosHoHusc9p1vhyM61oVtS75+Bo4gebwp98gC7CMqGnzh8MoVwWIkGvJTep3rvtvCLqBTyE7t8ceqh5Dg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-31T21:51:24.578896Z"},"content_sha256":"64be5eaa04987aea30f8011d519e790cf14f10a94f3483361ccd848d57cc39cc","schema_version":"1.0","event_id":"sha256:64be5eaa04987aea30f8011d519e790cf14f10a94f3483361ccd848d57cc39cc"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/MUNRGAUKXDN3AS6PILEHIIXO5Y/bundle.json","state_url":"https://pith.science/pith/MUNRGAUKXDN3AS6PILEHIIXO5Y/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/MUNRGAUKXDN3AS6PILEHIIXO5Y/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-31T21:51:24Z","links":{"resolver":"https://pith.science/pith/MUNRGAUKXDN3AS6PILEHIIXO5Y","bundle":"https://pith.science/pith/MUNRGAUKXDN3AS6PILEHIIXO5Y/bundle.json","state":"https://pith.science/pith/MUNRGAUKXDN3AS6PILEHIIXO5Y/state.json","well_known_bundle":"https://pith.science/.well-known/pith/MUNRGAUKXDN3AS6PILEHIIXO5Y/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:MUNRGAUKXDN3AS6PILEHIIXO5Y","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"75c8a6e40ae4c65700b5c11715334b9e7e52624b7f1857693e421012fc5ce241","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-03-23T17:56:52Z","title_canon_sha256":"0bbc81c192dee09c25af7997247b6ad8a825e9fbb458ac5d2b94559859087437"},"schema_version":"1.0","source":{"id":"2603.22273","kind":"arxiv","version":4}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2603.22273","created_at":"2026-05-18T02:44:30Z"},{"alias_kind":"arxiv_version","alias_value":"2603.22273v4","created_at":"2026-05-18T02:44:30Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2603.22273","created_at":"2026-05-18T02:44:30Z"},{"alias_kind":"pith_short_12","alias_value":"MUNRGAUKXDN3","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"MUNRGAUKXDN3AS6P","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"MUNRGAUK","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:64be5eaa04987aea30f8011d519e790cf14f10a94f3483361ccd848d57cc39cc","target":"graph","created_at":"2026-05-18T02:44:30Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"By removing the overhead of policy optimization, our approach explores an order of magnitude more efficiently than standard intrinsic motivation baselines on hard exploration benchmarks. ... achieving state-of-the-art performance by a wide margin on Montezuma's Revenge, Pitfall!, and Venture without relying on domain-specific knowledge. ... solving the MuJoCo Adroit dexterous manipulation and AntMaze tasks in a sparse-reward setting, directly from image observations and without expert demonstrations or offline datasets."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That an uncertainty measure paired with Go-With-The-Winner-style tree search will systematically expand state coverage in hard exploration domains without the policy optimization step, and that the resulting trajectories can be reliably distilled into high-performing policies using existing supervised backward learning algorithms."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Uncertainty-guided tree search decouples exploration from RL policy optimization, achieving order-of-magnitude better efficiency and SOTA performance on sparse-reward tasks like Montezuma's Revenge, Pitfall, and Venture via trajectory distillation."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Uncertainty-guided tree search decouples exploration from policy optimization to reach SOTA on hard RL benchmarks."}],"snapshot_sha256":"ffd20b98b42349f21d1bdbe122ffb09ffa9b2ec1f9508bc22410a3c2cacd0ada"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"The process of discovery requires active exploration -- the act of collecting new and informative data. However, efficient autonomous exploration remains a major unsolved problem. The dominant paradigm addresses this challenge by using Reinforcement Learning (RL) to train agents with intrinsic motivation, maximizing a composite objective of extrinsic and intrinsic rewards. We suggest that this approach incurs unnecessary overhead: while policy optimization is necessary for precise task execution, employing such machinery solely to expand state coverage may be inefficient. In this paper, we pro","authors_text":"James Cohan, Zakaria Mhammedi","cross_cats":[],"headline":"Uncertainty-guided tree search decouples exploration from policy optimization to reach SOTA on hard RL benchmarks.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-03-23T17:56:52Z","title":"Decoupling Exploration and Policy Optimization: Uncertainty Guided Tree Search for Hard Exploration"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2603.22273","kind":"arxiv","version":4},"verdict":{"created_at":"2026-05-15T00:17:00.319794Z","id":"30a17c29-74a6-4588-abc3-4c13a76975d2","model_set":{"reader":"grok-4.3"},"one_line_summary":"Uncertainty-guided tree search decouples exploration from RL policy optimization, achieving order-of-magnitude better efficiency and SOTA performance on sparse-reward tasks like Montezuma's Revenge, Pitfall, and Venture via trajectory distillation.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Uncertainty-guided tree search decouples exploration from policy optimization to reach SOTA on hard RL benchmarks.","strongest_claim":"By removing the overhead of policy optimization, our approach explores an order of magnitude more efficiently than standard intrinsic motivation baselines on hard exploration benchmarks. ... achieving state-of-the-art performance by a wide margin on Montezuma's Revenge, Pitfall!, and Venture without relying on domain-specific knowledge. ... solving the MuJoCo Adroit dexterous manipulation and AntMaze tasks in a sparse-reward setting, directly from image observations and without expert demonstrations or offline datasets.","weakest_assumption":"That an uncertainty measure paired with Go-With-The-Winner-style tree search will systematically expand state coverage in hard exploration domains without the policy optimization step, and that the resulting trajectories can be reliably distilled into high-performing policies using existing supervised backward learning algorithms."}},"verdict_id":"30a17c29-74a6-4588-abc3-4c13a76975d2"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:44d8de9d47c72bfb39240bf9ad93c23a2832a2ec39ba858de7a6814d9c0c54dc","target":"record","created_at":"2026-05-18T02:44:30Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"75c8a6e40ae4c65700b5c11715334b9e7e52624b7f1857693e421012fc5ce241","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-03-23T17:56:52Z","title_canon_sha256":"0bbc81c192dee09c25af7997247b6ad8a825e9fbb458ac5d2b94559859087437"},"schema_version":"1.0","source":{"id":"2603.22273","kind":"arxiv","version":4}},"canonical_sha256":"651b13028ab8dbb04bcf42c87422eeee30bace98d09b6ca7a3b140b601a51de6","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"651b13028ab8dbb04bcf42c87422eeee30bace98d09b6ca7a3b140b601a51de6","first_computed_at":"2026-05-18T02:44:30.761130Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-18T02:44:30.761130Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"MhViqjvAMTRh+JBxYZKjFrtdxtcSn/ev6OIOIekfVcR83iRTOHDLoZLD7fezgRFzYn7xwOj45Wk+UWlHhi0EBw==","signature_status":"signed_v1","signed_at":"2026-05-18T02:44:30.762111Z","signed_message":"canonical_sha256_bytes"},"source_id":"2603.22273","source_kind":"arxiv","source_version":4}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:44d8de9d47c72bfb39240bf9ad93c23a2832a2ec39ba858de7a6814d9c0c54dc","sha256:64be5eaa04987aea30f8011d519e790cf14f10a94f3483361ccd848d57cc39cc"],"state_sha256":"c5c6a497075a1c9c8e70466f585a86a2041b328e5f2ab1cb641a84ec8cc1791b"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"3cKe6lLsvu1L4+AzmBwv3azf33NH7/kAHqIDIhhW/CNgp2MEu9R1VWAUXG3UXjrN1uz9igXOOVeIcR3hsD36Dw==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-31T21:51:24.582977Z","bundle_sha256":"9b041f0778b9a729d2deaa90733873c194df762c9c7bafa3e5729dd6b5ba0d4e"}}