{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2018:UAXZQNFHJR5FBXNSMCYSPEJNKL","short_pith_number":"pith:UAXZQNFH","canonical_record":{"source":{"id":"1806.06920","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2018-06-14T12:46:23Z","cross_cats_sorted":["cs.AI","cs.IT","cs.RO","math.IT","stat.ML"],"title_canon_sha256":"eccde276c3f8de0e1b551d861c94e59d7836421b50ccd048e66805a67671b88e","abstract_canon_sha256":"80ef65e0114b46cbcd1411b8c1bcd125aef9f7e5ba657e9e84db07dc6e66f65a"},"schema_version":"1.0"},"canonical_sha256":"a02f9834a74c7a50ddb260b127912d52c5a6ed30ac329fc99226664ba943c29f","source":{"kind":"arxiv","id":"1806.06920","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"1806.06920","created_at":"2026-05-18T00:12:37Z"},{"alias_kind":"arxiv_version","alias_value":"1806.06920v1","created_at":"2026-05-18T00:12:37Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1806.06920","created_at":"2026-05-18T00:12:37Z"},{"alias_kind":"pith_short_12","alias_value":"UAXZQNFHJR5F","created_at":"2026-05-18T12:32:56Z"},{"alias_kind":"pith_short_16","alias_value":"UAXZQNFHJR5FBXNS","created_at":"2026-05-18T12:32:56Z"},{"alias_kind":"pith_short_8","alias_value":"UAXZQNFH","created_at":"2026-05-18T12:32:56Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2018:UAXZQNFHJR5FBXNSMCYSPEJNKL","target":"record","payload":{"canonical_record":{"source":{"id":"1806.06920","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2018-06-14T12:46:23Z","cross_cats_sorted":["cs.AI","cs.IT","cs.RO","math.IT","stat.ML"],"title_canon_sha256":"eccde276c3f8de0e1b551d861c94e59d7836421b50ccd048e66805a67671b88e","abstract_canon_sha256":"80ef65e0114b46cbcd1411b8c1bcd125aef9f7e5ba657e9e84db07dc6e66f65a"},"schema_version":"1.0"},"canonical_sha256":"a02f9834a74c7a50ddb260b127912d52c5a6ed30ac329fc99226664ba943c29f","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T00:12:37.726177Z","signature_b64":"1WfTboq4Z5Ahm7CCgEFLa2XpC2CAYBpNDOn3bk0J16XZHfJbarLQcKAm8l+V/7NSOrbyHhVBxIIZ108M7Ta/Aw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"a02f9834a74c7a50ddb260b127912d52c5a6ed30ac329fc99226664ba943c29f","last_reissued_at":"2026-05-18T00:12:37.725340Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T00:12:37.725340Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"1806.06920","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T00:12:37Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"1da+jtU+/891tRgMs6vofV/dHgJ57LXqNYyNJWBphIdp0nHU0W9pkxA2Z07tiACuNGg+GzOujMadfVwZTfg2BQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-09T08:55:35.853345Z"},"content_sha256":"96698dd0c867fccbd05c7ee28cebc1c10b47bc2901b5548b88a396bbae645fe6","schema_version":"1.0","event_id":"sha256:96698dd0c867fccbd05c7ee28cebc1c10b47bc2901b5548b88a396bbae645fe6"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2018:UAXZQNFHJR5FBXNSMCYSPEJNKL","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Maximum a Posteriori Policy Optimisation","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","cs.IT","cs.RO","math.IT","stat.ML"],"primary_cat":"cs.LG","authors_text":"Abbas Abdolmaleki, Jost Tobias Springenberg, Martin Riedmiller, Nicolas Heess, Remi Munos, Yuval Tassa","submitted_at":"2018-06-14T12:46:23Z","abstract_excerpt":"We introduce a new algorithm for reinforcement learning called Maximum aposteriori Policy Optimisation (MPO) based on coordinate ascent on a relative entropy objective. We show that several existing methods can directly be related to our derivation. We develop two off-policy algorithms and demonstrate that they are competitive with the state-of-the-art in deep reinforcement learning. In particular, for continuous control, our method outperforms existing methods with respect to sample efficiency, premature convergence and robustness to hyperparameter settings while achieving similar or better f"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1806.06920","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T00:12:37Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"xpFiL/qoOdL/oAhgAwnNRhc0LjNzDoekw5Uot4Txcqycwo3UjWXaw37gMxIfum0PRrU8t/vc38WYLj0AIjoPBQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-09T08:55:35.854057Z"},"content_sha256":"dfe20edf34960c43f66146ba2b529708c7ef7f2f385d08d8d77bf32e9725506b","schema_version":"1.0","event_id":"sha256:dfe20edf34960c43f66146ba2b529708c7ef7f2f385d08d8d77bf32e9725506b"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/UAXZQNFHJR5FBXNSMCYSPEJNKL/bundle.json","state_url":"https://pith.science/pith/UAXZQNFHJR5FBXNSMCYSPEJNKL/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/UAXZQNFHJR5FBXNSMCYSPEJNKL/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-09T08:55:35Z","links":{"resolver":"https://pith.science/pith/UAXZQNFHJR5FBXNSMCYSPEJNKL","bundle":"https://pith.science/pith/UAXZQNFHJR5FBXNSMCYSPEJNKL/bundle.json","state":"https://pith.science/pith/UAXZQNFHJR5FBXNSMCYSPEJNKL/state.json","well_known_bundle":"https://pith.science/.well-known/pith/UAXZQNFHJR5FBXNSMCYSPEJNKL/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2018:UAXZQNFHJR5FBXNSMCYSPEJNKL","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"80ef65e0114b46cbcd1411b8c1bcd125aef9f7e5ba657e9e84db07dc6e66f65a","cross_cats_sorted":["cs.AI","cs.IT","cs.RO","math.IT","stat.ML"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2018-06-14T12:46:23Z","title_canon_sha256":"eccde276c3f8de0e1b551d861c94e59d7836421b50ccd048e66805a67671b88e"},"schema_version":"1.0","source":{"id":"1806.06920","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"1806.06920","created_at":"2026-05-18T00:12:37Z"},{"alias_kind":"arxiv_version","alias_value":"1806.06920v1","created_at":"2026-05-18T00:12:37Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1806.06920","created_at":"2026-05-18T00:12:37Z"},{"alias_kind":"pith_short_12","alias_value":"UAXZQNFHJR5F","created_at":"2026-05-18T12:32:56Z"},{"alias_kind":"pith_short_16","alias_value":"UAXZQNFHJR5FBXNS","created_at":"2026-05-18T12:32:56Z"},{"alias_kind":"pith_short_8","alias_value":"UAXZQNFH","created_at":"2026-05-18T12:32:56Z"}],"graph_snapshots":[{"event_id":"sha256:dfe20edf34960c43f66146ba2b529708c7ef7f2f385d08d8d77bf32e9725506b","target":"graph","created_at":"2026-05-18T00:12:37Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"We introduce a new algorithm for reinforcement learning called Maximum aposteriori Policy Optimisation (MPO) based on coordinate ascent on a relative entropy objective. We show that several existing methods can directly be related to our derivation. We develop two off-policy algorithms and demonstrate that they are competitive with the state-of-the-art in deep reinforcement learning. In particular, for continuous control, our method outperforms existing methods with respect to sample efficiency, premature convergence and robustness to hyperparameter settings while achieving similar or better f","authors_text":"Abbas Abdolmaleki, Jost Tobias Springenberg, Martin Riedmiller, Nicolas Heess, Remi Munos, Yuval Tassa","cross_cats":["cs.AI","cs.IT","cs.RO","math.IT","stat.ML"],"headline":"","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2018-06-14T12:46:23Z","title":"Maximum a Posteriori Policy Optimisation"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1806.06920","kind":"arxiv","version":1},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:96698dd0c867fccbd05c7ee28cebc1c10b47bc2901b5548b88a396bbae645fe6","target":"record","created_at":"2026-05-18T00:12:37Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"80ef65e0114b46cbcd1411b8c1bcd125aef9f7e5ba657e9e84db07dc6e66f65a","cross_cats_sorted":["cs.AI","cs.IT","cs.RO","math.IT","stat.ML"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2018-06-14T12:46:23Z","title_canon_sha256":"eccde276c3f8de0e1b551d861c94e59d7836421b50ccd048e66805a67671b88e"},"schema_version":"1.0","source":{"id":"1806.06920","kind":"arxiv","version":1}},"canonical_sha256":"a02f9834a74c7a50ddb260b127912d52c5a6ed30ac329fc99226664ba943c29f","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"a02f9834a74c7a50ddb260b127912d52c5a6ed30ac329fc99226664ba943c29f","first_computed_at":"2026-05-18T00:12:37.725340Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-18T00:12:37.725340Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"1WfTboq4Z5Ahm7CCgEFLa2XpC2CAYBpNDOn3bk0J16XZHfJbarLQcKAm8l+V/7NSOrbyHhVBxIIZ108M7Ta/Aw==","signature_status":"signed_v1","signed_at":"2026-05-18T00:12:37.726177Z","signed_message":"canonical_sha256_bytes"},"source_id":"1806.06920","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:96698dd0c867fccbd05c7ee28cebc1c10b47bc2901b5548b88a396bbae645fe6","sha256:dfe20edf34960c43f66146ba2b529708c7ef7f2f385d08d8d77bf32e9725506b"],"state_sha256":"ae599d34eae16b59cfc67e830f7dd8700de823bf40abfad044bd45bab5642e25"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"AeiYcJxKDP6S7tL9T6wLMdosHH84KfohsjoUe5v1kUcIL7FqDiDeLB4bX/XQs8uWYXH/dkY/K16K7GLznxPZAw==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-09T08:55:35.858434Z","bundle_sha256":"334071a033aaf8da784ae3baec60aa5f403c3c1cd0efba77f28541329e8fa285"}}