{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2018:46A36QVGBWQPENKJR6VF7NBK4U","short_pith_number":"pith:46A36QVG","canonical_record":{"source":{"id":"1811.07350","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2018-11-18T16:48:40Z","cross_cats_sorted":["stat.ML"],"title_canon_sha256":"14f0bb416bc85341c72d508fb6e21b10ce07ed691c2f17dc9300ad1e054a0812","abstract_canon_sha256":"604ed101a82451f49c9d3986d93a6e61399ef265ae290e1ca6dc66e0327f710b"},"schema_version":"1.0"},"canonical_sha256":"e781bf42a60da0f235498faa5fb42ae51a4725ba3d0e355bdfdfc4afaa04b76d","source":{"kind":"arxiv","id":"1811.07350","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"1811.07350","created_at":"2026-05-18T00:00:28Z"},{"alias_kind":"arxiv_version","alias_value":"1811.07350v1","created_at":"2026-05-18T00:00:28Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1811.07350","created_at":"2026-05-18T00:00:28Z"},{"alias_kind":"pith_short_12","alias_value":"46A36QVGBWQP","created_at":"2026-05-18T12:32:05Z"},{"alias_kind":"pith_short_16","alias_value":"46A36QVGBWQPENKJ","created_at":"2026-05-18T12:32:05Z"},{"alias_kind":"pith_short_8","alias_value":"46A36QVG","created_at":"2026-05-18T12:32:05Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2018:46A36QVGBWQPENKJR6VF7NBK4U","target":"record","payload":{"canonical_record":{"source":{"id":"1811.07350","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2018-11-18T16:48:40Z","cross_cats_sorted":["stat.ML"],"title_canon_sha256":"14f0bb416bc85341c72d508fb6e21b10ce07ed691c2f17dc9300ad1e054a0812","abstract_canon_sha256":"604ed101a82451f49c9d3986d93a6e61399ef265ae290e1ca6dc66e0327f710b"},"schema_version":"1.0"},"canonical_sha256":"e781bf42a60da0f235498faa5fb42ae51a4725ba3d0e355bdfdfc4afaa04b76d","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T00:00:28.188510Z","signature_b64":"6C39o+Z22gRjjgbEQPQUvKdJVsUF0fBxTX8uj7ThVaqe2hgmUlVEyfmgeVcxlKchDF38jkLBmcAx8rE8lC5bAQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"e781bf42a60da0f235498faa5fb42ae51a4725ba3d0e355bdfdfc4afaa04b76d","last_reissued_at":"2026-05-18T00:00:28.187866Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T00:00:28.187866Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"1811.07350","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T00:00:28Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"eqKcGo0VP/6QEKdnLM5rZEHw4a0gz1dNYGNRETGjrBhIP2Y2luo1QA2Qx4/CEmIditqMN9VqrHaYHAgMQwgJBA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-05T11:51:26.892443Z"},"content_sha256":"8b305a62bb03dab23136a11952bd613e11efbe0faa916430298f96351ca58565","schema_version":"1.0","event_id":"sha256:8b305a62bb03dab23136a11952bd613e11efbe0faa916430298f96351ca58565"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2018:46A36QVGBWQPENKJR6VF7NBK4U","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Policy Optimization with Model-based Explorations","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["stat.ML"],"primary_cat":"cs.LG","authors_text":"An-xiang Zeng, Chun-Xiang Pan, Feiyang Pan, Hualin He, Pingzhong Tang, Qing Da, Qing He, Qingpeng Cai","submitted_at":"2018-11-18T16:48:40Z","abstract_excerpt":"Model-free reinforcement learning methods such as the Proximal Policy Optimization algorithm (PPO) have successfully applied in complex decision-making problems such as Atari games. However, these methods suffer from high variances and high sample complexity. On the other hand, model-based reinforcement learning methods that learn the transition dynamics are more sample efficient, but they often suffer from the bias of the transition estimation. How to make use of both model-based and model-free learning is a central problem in reinforcement learning. In this paper, we present a new technique "},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1811.07350","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T00:00:28Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"7xg6UFRxtGq21jVfX4FDrYO4CYmn3V5ZxnzFO1bpHx6Hlmpvef3DYkafS9pCjyFqNybxe7fg8ecHUoYh5nfrBA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-05T11:51:26.892793Z"},"content_sha256":"f3121de5274edeaf834cfc991c99b2cf921ee1c296b8da958e80ca34fffeed8a","schema_version":"1.0","event_id":"sha256:f3121de5274edeaf834cfc991c99b2cf921ee1c296b8da958e80ca34fffeed8a"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/46A36QVGBWQPENKJR6VF7NBK4U/bundle.json","state_url":"https://pith.science/pith/46A36QVGBWQPENKJR6VF7NBK4U/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/46A36QVGBWQPENKJR6VF7NBK4U/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-05T11:51:26Z","links":{"resolver":"https://pith.science/pith/46A36QVGBWQPENKJR6VF7NBK4U","bundle":"https://pith.science/pith/46A36QVGBWQPENKJR6VF7NBK4U/bundle.json","state":"https://pith.science/pith/46A36QVGBWQPENKJR6VF7NBK4U/state.json","well_known_bundle":"https://pith.science/.well-known/pith/46A36QVGBWQPENKJR6VF7NBK4U/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2018:46A36QVGBWQPENKJR6VF7NBK4U","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"604ed101a82451f49c9d3986d93a6e61399ef265ae290e1ca6dc66e0327f710b","cross_cats_sorted":["stat.ML"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2018-11-18T16:48:40Z","title_canon_sha256":"14f0bb416bc85341c72d508fb6e21b10ce07ed691c2f17dc9300ad1e054a0812"},"schema_version":"1.0","source":{"id":"1811.07350","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"1811.07350","created_at":"2026-05-18T00:00:28Z"},{"alias_kind":"arxiv_version","alias_value":"1811.07350v1","created_at":"2026-05-18T00:00:28Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1811.07350","created_at":"2026-05-18T00:00:28Z"},{"alias_kind":"pith_short_12","alias_value":"46A36QVGBWQP","created_at":"2026-05-18T12:32:05Z"},{"alias_kind":"pith_short_16","alias_value":"46A36QVGBWQPENKJ","created_at":"2026-05-18T12:32:05Z"},{"alias_kind":"pith_short_8","alias_value":"46A36QVG","created_at":"2026-05-18T12:32:05Z"}],"graph_snapshots":[{"event_id":"sha256:f3121de5274edeaf834cfc991c99b2cf921ee1c296b8da958e80ca34fffeed8a","target":"graph","created_at":"2026-05-18T00:00:28Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"Model-free reinforcement learning methods such as the Proximal Policy Optimization algorithm (PPO) have successfully applied in complex decision-making problems such as Atari games. However, these methods suffer from high variances and high sample complexity. On the other hand, model-based reinforcement learning methods that learn the transition dynamics are more sample efficient, but they often suffer from the bias of the transition estimation. How to make use of both model-based and model-free learning is a central problem in reinforcement learning. In this paper, we present a new technique ","authors_text":"An-xiang Zeng, Chun-Xiang Pan, Feiyang Pan, Hualin He, Pingzhong Tang, Qing Da, Qing He, Qingpeng Cai","cross_cats":["stat.ML"],"headline":"","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2018-11-18T16:48:40Z","title":"Policy Optimization with Model-based Explorations"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1811.07350","kind":"arxiv","version":1},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:8b305a62bb03dab23136a11952bd613e11efbe0faa916430298f96351ca58565","target":"record","created_at":"2026-05-18T00:00:28Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"604ed101a82451f49c9d3986d93a6e61399ef265ae290e1ca6dc66e0327f710b","cross_cats_sorted":["stat.ML"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2018-11-18T16:48:40Z","title_canon_sha256":"14f0bb416bc85341c72d508fb6e21b10ce07ed691c2f17dc9300ad1e054a0812"},"schema_version":"1.0","source":{"id":"1811.07350","kind":"arxiv","version":1}},"canonical_sha256":"e781bf42a60da0f235498faa5fb42ae51a4725ba3d0e355bdfdfc4afaa04b76d","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"e781bf42a60da0f235498faa5fb42ae51a4725ba3d0e355bdfdfc4afaa04b76d","first_computed_at":"2026-05-18T00:00:28.187866Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-18T00:00:28.187866Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"6C39o+Z22gRjjgbEQPQUvKdJVsUF0fBxTX8uj7ThVaqe2hgmUlVEyfmgeVcxlKchDF38jkLBmcAx8rE8lC5bAQ==","signature_status":"signed_v1","signed_at":"2026-05-18T00:00:28.188510Z","signed_message":"canonical_sha256_bytes"},"source_id":"1811.07350","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:8b305a62bb03dab23136a11952bd613e11efbe0faa916430298f96351ca58565","sha256:f3121de5274edeaf834cfc991c99b2cf921ee1c296b8da958e80ca34fffeed8a"],"state_sha256":"ef266f2de35710eeb7ee023330fee8b3a5e9d6148b07117de13d85067c7db21a"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"UEUnMvzlATDw/AXflQp+/x1biDTd6PTLMxGElnVryZA7rigpSnj8WhGjqpMZYq1/8S7L5ReoCi8EUcXJ+RSaAA==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-05T11:51:26.894789Z","bundle_sha256":"627d11a6cbeda8f3faf9bd03a745fcfc29d6f4ce3215dc1c0d78f5b753669bb1"}}