{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2018:Y74VH3AYFC3N6ZGAT7CKY6P5G3","short_pith_number":"pith:Y74VH3AY","canonical_record":{"source":{"id":"1812.01552","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"math.OC","submitted_at":"2018-12-04T17:46:06Z","cross_cats_sorted":["cs.LG"],"title_canon_sha256":"d9c1b2f818f522654429252a2ae675f7f0b932d38f965f392ecdefce3df05614","abstract_canon_sha256":"a6c08c244822bf9c2bd18d84c58eed782cdadf094802d3132b1a2d7bb3d0ebf1"},"schema_version":"1.0"},"canonical_sha256":"c7f953ec1828b6df64c09fc4ac79fd36dcabea65098c32a31940c5b2bce1f516","source":{"kind":"arxiv","id":"1812.01552","version":3},"source_aliases":[{"alias_kind":"arxiv","alias_value":"1812.01552","created_at":"2026-05-17T23:54:06Z"},{"alias_kind":"arxiv_version","alias_value":"1812.01552v3","created_at":"2026-05-17T23:54:06Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1812.01552","created_at":"2026-05-17T23:54:06Z"},{"alias_kind":"pith_short_12","alias_value":"Y74VH3AYFC3N","created_at":"2026-05-18T12:33:04Z"},{"alias_kind":"pith_short_16","alias_value":"Y74VH3AYFC3N6ZGA","created_at":"2026-05-18T12:33:04Z"},{"alias_kind":"pith_short_8","alias_value":"Y74VH3AY","created_at":"2026-05-18T12:33:04Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2018:Y74VH3AYFC3N6ZGAT7CKY6P5G3","target":"record","payload":{"canonical_record":{"source":{"id":"1812.01552","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"math.OC","submitted_at":"2018-12-04T17:46:06Z","cross_cats_sorted":["cs.LG"],"title_canon_sha256":"d9c1b2f818f522654429252a2ae675f7f0b932d38f965f392ecdefce3df05614","abstract_canon_sha256":"a6c08c244822bf9c2bd18d84c58eed782cdadf094802d3132b1a2d7bb3d0ebf1"},"schema_version":"1.0"},"canonical_sha256":"c7f953ec1828b6df64c09fc4ac79fd36dcabea65098c32a31940c5b2bce1f516","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:54:06.951455Z","signature_b64":"ypFs2a7UZYz3QedD3GXAJQzoWzaOXuro3ngosyiL9WJAgFkH1Y90cmql4RZJE+IFnPoygv5RIXakXsx59n3hAw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"c7f953ec1828b6df64c09fc4ac79fd36dcabea65098c32a31940c5b2bce1f516","last_reissued_at":"2026-05-17T23:54:06.950994Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:54:06.950994Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"1812.01552","source_version":3,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:54:06Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"SNLRLBCbOlRN3JCxOPfy7FIszIAt4r7uA4FlMizAg1WLKunQGNTshNzJklLAw5a2dxFyrQVHYF2yB1EGBboVAw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-28T01:19:10.036499Z"},"content_sha256":"c89b9f7003d1a01955ff8036fe13dc6dd710a4fb397c98a4d1a2dbcaca763737","schema_version":"1.0","event_id":"sha256:c89b9f7003d1a01955ff8036fe13dc6dd710a4fb397c98a4d1a2dbcaca763737"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2018:Y74VH3AYFC3N6ZGAT7CKY6P5G3","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Exploration versus exploitation in reinforcement learning: a stochastic control approach","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.LG"],"primary_cat":"math.OC","authors_text":"Haoran Wang, Thaleia Zariphopoulou, Xunyu Zhou","submitted_at":"2018-12-04T17:46:06Z","abstract_excerpt":"We consider reinforcement learning (RL) in continuous time and study the problem of achieving the best trade-off between exploration of a black box environment and exploitation of current knowledge. We propose an entropy-regularized reward function involving the differential entropy of the distributions of actions, and motivate and devise an exploratory formulation for the feature dynamics that captures repetitive learning under exploration. The resulting optimization problem is a revitalization of the classical relaxed stochastic control. We carry out a complete analysis of the problem in the"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1812.01552","kind":"arxiv","version":3},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:54:06Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"U3mlPTFsDdRDsaNQjufrFRtCHYHt8ZNzHhiTx7qBMqoBZQzP232+seJ3mR8NZPjd239nr1EOTeLVumnhYLRgAw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-28T01:19:10.036848Z"},"content_sha256":"26f0e426f8458c0fa1110f43c89ca8c497515e77c333ee0c2aa58c4ec11d73c9","schema_version":"1.0","event_id":"sha256:26f0e426f8458c0fa1110f43c89ca8c497515e77c333ee0c2aa58c4ec11d73c9"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/Y74VH3AYFC3N6ZGAT7CKY6P5G3/bundle.json","state_url":"https://pith.science/pith/Y74VH3AYFC3N6ZGAT7CKY6P5G3/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/Y74VH3AYFC3N6ZGAT7CKY6P5G3/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-28T01:19:10Z","links":{"resolver":"https://pith.science/pith/Y74VH3AYFC3N6ZGAT7CKY6P5G3","bundle":"https://pith.science/pith/Y74VH3AYFC3N6ZGAT7CKY6P5G3/bundle.json","state":"https://pith.science/pith/Y74VH3AYFC3N6ZGAT7CKY6P5G3/state.json","well_known_bundle":"https://pith.science/.well-known/pith/Y74VH3AYFC3N6ZGAT7CKY6P5G3/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2018:Y74VH3AYFC3N6ZGAT7CKY6P5G3","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"a6c08c244822bf9c2bd18d84c58eed782cdadf094802d3132b1a2d7bb3d0ebf1","cross_cats_sorted":["cs.LG"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"math.OC","submitted_at":"2018-12-04T17:46:06Z","title_canon_sha256":"d9c1b2f818f522654429252a2ae675f7f0b932d38f965f392ecdefce3df05614"},"schema_version":"1.0","source":{"id":"1812.01552","kind":"arxiv","version":3}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"1812.01552","created_at":"2026-05-17T23:54:06Z"},{"alias_kind":"arxiv_version","alias_value":"1812.01552v3","created_at":"2026-05-17T23:54:06Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1812.01552","created_at":"2026-05-17T23:54:06Z"},{"alias_kind":"pith_short_12","alias_value":"Y74VH3AYFC3N","created_at":"2026-05-18T12:33:04Z"},{"alias_kind":"pith_short_16","alias_value":"Y74VH3AYFC3N6ZGA","created_at":"2026-05-18T12:33:04Z"},{"alias_kind":"pith_short_8","alias_value":"Y74VH3AY","created_at":"2026-05-18T12:33:04Z"}],"graph_snapshots":[{"event_id":"sha256:26f0e426f8458c0fa1110f43c89ca8c497515e77c333ee0c2aa58c4ec11d73c9","target":"graph","created_at":"2026-05-17T23:54:06Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"We consider reinforcement learning (RL) in continuous time and study the problem of achieving the best trade-off between exploration of a black box environment and exploitation of current knowledge. We propose an entropy-regularized reward function involving the differential entropy of the distributions of actions, and motivate and devise an exploratory formulation for the feature dynamics that captures repetitive learning under exploration. The resulting optimization problem is a revitalization of the classical relaxed stochastic control. We carry out a complete analysis of the problem in the","authors_text":"Haoran Wang, Thaleia Zariphopoulou, Xunyu Zhou","cross_cats":["cs.LG"],"headline":"","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"math.OC","submitted_at":"2018-12-04T17:46:06Z","title":"Exploration versus exploitation in reinforcement learning: a stochastic control approach"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1812.01552","kind":"arxiv","version":3},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:c89b9f7003d1a01955ff8036fe13dc6dd710a4fb397c98a4d1a2dbcaca763737","target":"record","created_at":"2026-05-17T23:54:06Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"a6c08c244822bf9c2bd18d84c58eed782cdadf094802d3132b1a2d7bb3d0ebf1","cross_cats_sorted":["cs.LG"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"math.OC","submitted_at":"2018-12-04T17:46:06Z","title_canon_sha256":"d9c1b2f818f522654429252a2ae675f7f0b932d38f965f392ecdefce3df05614"},"schema_version":"1.0","source":{"id":"1812.01552","kind":"arxiv","version":3}},"canonical_sha256":"c7f953ec1828b6df64c09fc4ac79fd36dcabea65098c32a31940c5b2bce1f516","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"c7f953ec1828b6df64c09fc4ac79fd36dcabea65098c32a31940c5b2bce1f516","first_computed_at":"2026-05-17T23:54:06.950994Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:54:06.950994Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"ypFs2a7UZYz3QedD3GXAJQzoWzaOXuro3ngosyiL9WJAgFkH1Y90cmql4RZJE+IFnPoygv5RIXakXsx59n3hAw==","signature_status":"signed_v1","signed_at":"2026-05-17T23:54:06.951455Z","signed_message":"canonical_sha256_bytes"},"source_id":"1812.01552","source_kind":"arxiv","source_version":3}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:c89b9f7003d1a01955ff8036fe13dc6dd710a4fb397c98a4d1a2dbcaca763737","sha256:26f0e426f8458c0fa1110f43c89ca8c497515e77c333ee0c2aa58c4ec11d73c9"],"state_sha256":"f44a29efbbfe9c0ac9472a208b0a3b620445be096ffb3bab658f2e107552e044"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"IiE+WKfFgp6BxuVE2rg2hpUqpWoEpIY3011BZyzNrZdpDYEIy0JmAayR3iGFuFN0UIAIws3l5wjpX0ISqeP3Ag==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-28T01:19:10.038816Z","bundle_sha256":"a1b3b50d4d4f9ebda458a0e44228703a5170254a1616d2e9a1a5c1f9196e5d42"}}