{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2019:GMVCIDOC7N2TRYW6WPSRWI6NRN","short_pith_number":"pith:GMVCIDOC","canonical_record":{"source":{"id":"1901.06212","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2019-01-18T13:09:18Z","cross_cats_sorted":["cs.AI","stat.ML"],"title_canon_sha256":"216f4df54a356515f6be4bb692bf930f68c31feddcd3709686b2c565f141fff4","abstract_canon_sha256":"3f8f5c184dde3c63aa696147208ad7cf46f01a658424b91e23c94dbc02ccde51"},"schema_version":"1.0"},"canonical_sha256":"332a240dc2fb7538e2deb3e51b23cd8b408152d5506e2d623f7a127e3ec2309a","source":{"kind":"arxiv","id":"1901.06212","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"1901.06212","created_at":"2026-05-17T23:56:04Z"},{"alias_kind":"arxiv_version","alias_value":"1901.06212v1","created_at":"2026-05-17T23:56:04Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1901.06212","created_at":"2026-05-17T23:56:04Z"},{"alias_kind":"pith_short_12","alias_value":"GMVCIDOC7N2T","created_at":"2026-05-18T12:33:18Z"},{"alias_kind":"pith_short_16","alias_value":"GMVCIDOC7N2TRYW6","created_at":"2026-05-18T12:33:18Z"},{"alias_kind":"pith_short_8","alias_value":"GMVCIDOC","created_at":"2026-05-18T12:33:18Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2019:GMVCIDOC7N2TRYW6WPSRWI6NRN","target":"record","payload":{"canonical_record":{"source":{"id":"1901.06212","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2019-01-18T13:09:18Z","cross_cats_sorted":["cs.AI","stat.ML"],"title_canon_sha256":"216f4df54a356515f6be4bb692bf930f68c31feddcd3709686b2c565f141fff4","abstract_canon_sha256":"3f8f5c184dde3c63aa696147208ad7cf46f01a658424b91e23c94dbc02ccde51"},"schema_version":"1.0"},"canonical_sha256":"332a240dc2fb7538e2deb3e51b23cd8b408152d5506e2d623f7a127e3ec2309a","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:56:04.214200Z","signature_b64":"gTAxN5KveJD5z84FAbeQ9j6Or0Wr+EqxxMXrQKxueVfKcVsegVcj+t9hJlHZSdhoWABBOb1nvUJxlhxszwd7BA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"332a240dc2fb7538e2deb3e51b23cd8b408152d5506e2d623f7a127e3ec2309a","last_reissued_at":"2026-05-17T23:56:04.213590Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:56:04.213590Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"1901.06212","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:56:04Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"+BjN8sNSW+bk3d3+WqtLxxOAUK1cT5W//M/YoEVTJ3GQlLDh6hWcd4xgyAG1y0VvAe5tehR2tIE8c2WIFmLiAA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-26T18:03:03.731632Z"},"content_sha256":"c08a4117ae2db24f53fc7f4fd565697ce06982f9ce8aa1875ea350e1f8e1e261","schema_version":"1.0","event_id":"sha256:c08a4117ae2db24f53fc7f4fd565697ce06982f9ce8aa1875ea350e1f8e1e261"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2019:GMVCIDOC7N2TRYW6WPSRWI6NRN","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"On-Policy Trust Region Policy Optimisation with Replay Buffers","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","stat.ML"],"primary_cat":"cs.LG","authors_text":"Dmitry Kangin, Nicolas Pugeault","submitted_at":"2019-01-18T13:09:18Z","abstract_excerpt":"Building upon the recent success of deep reinforcement learning methods, we investigate the possibility of on-policy reinforcement learning improvement by reusing the data from several consecutive policies. On-policy methods bring many benefits, such as ability to evaluate each resulting policy. However, they usually discard all the information about the policies which existed before. In this work, we propose adaptation of the replay buffer concept, borrowed from the off-policy learning setting, to create the method, combining advantages of on- and off-policy learning. To achieve this, the pro"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1901.06212","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:56:04Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"9JRU+w605JW9VMpWrgm0rIsUKFXmGytvn+DNlqGhm9xvSMBqGJaZcAeYZ0URLeEw7S7zsgWbkdR6kg0ELjHwCw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-26T18:03:03.732284Z"},"content_sha256":"ac045c3e96feba9d4cf2e108879c13eccc4b7c3dc85b5f6378fe2717c3222ee0","schema_version":"1.0","event_id":"sha256:ac045c3e96feba9d4cf2e108879c13eccc4b7c3dc85b5f6378fe2717c3222ee0"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/GMVCIDOC7N2TRYW6WPSRWI6NRN/bundle.json","state_url":"https://pith.science/pith/GMVCIDOC7N2TRYW6WPSRWI6NRN/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/GMVCIDOC7N2TRYW6WPSRWI6NRN/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-26T18:03:03Z","links":{"resolver":"https://pith.science/pith/GMVCIDOC7N2TRYW6WPSRWI6NRN","bundle":"https://pith.science/pith/GMVCIDOC7N2TRYW6WPSRWI6NRN/bundle.json","state":"https://pith.science/pith/GMVCIDOC7N2TRYW6WPSRWI6NRN/state.json","well_known_bundle":"https://pith.science/.well-known/pith/GMVCIDOC7N2TRYW6WPSRWI6NRN/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2019:GMVCIDOC7N2TRYW6WPSRWI6NRN","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"3f8f5c184dde3c63aa696147208ad7cf46f01a658424b91e23c94dbc02ccde51","cross_cats_sorted":["cs.AI","stat.ML"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2019-01-18T13:09:18Z","title_canon_sha256":"216f4df54a356515f6be4bb692bf930f68c31feddcd3709686b2c565f141fff4"},"schema_version":"1.0","source":{"id":"1901.06212","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"1901.06212","created_at":"2026-05-17T23:56:04Z"},{"alias_kind":"arxiv_version","alias_value":"1901.06212v1","created_at":"2026-05-17T23:56:04Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1901.06212","created_at":"2026-05-17T23:56:04Z"},{"alias_kind":"pith_short_12","alias_value":"GMVCIDOC7N2T","created_at":"2026-05-18T12:33:18Z"},{"alias_kind":"pith_short_16","alias_value":"GMVCIDOC7N2TRYW6","created_at":"2026-05-18T12:33:18Z"},{"alias_kind":"pith_short_8","alias_value":"GMVCIDOC","created_at":"2026-05-18T12:33:18Z"}],"graph_snapshots":[{"event_id":"sha256:ac045c3e96feba9d4cf2e108879c13eccc4b7c3dc85b5f6378fe2717c3222ee0","target":"graph","created_at":"2026-05-17T23:56:04Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"Building upon the recent success of deep reinforcement learning methods, we investigate the possibility of on-policy reinforcement learning improvement by reusing the data from several consecutive policies. On-policy methods bring many benefits, such as ability to evaluate each resulting policy. However, they usually discard all the information about the policies which existed before. In this work, we propose adaptation of the replay buffer concept, borrowed from the off-policy learning setting, to create the method, combining advantages of on- and off-policy learning. To achieve this, the pro","authors_text":"Dmitry Kangin, Nicolas Pugeault","cross_cats":["cs.AI","stat.ML"],"headline":"","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2019-01-18T13:09:18Z","title":"On-Policy Trust Region Policy Optimisation with Replay Buffers"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1901.06212","kind":"arxiv","version":1},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:c08a4117ae2db24f53fc7f4fd565697ce06982f9ce8aa1875ea350e1f8e1e261","target":"record","created_at":"2026-05-17T23:56:04Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"3f8f5c184dde3c63aa696147208ad7cf46f01a658424b91e23c94dbc02ccde51","cross_cats_sorted":["cs.AI","stat.ML"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2019-01-18T13:09:18Z","title_canon_sha256":"216f4df54a356515f6be4bb692bf930f68c31feddcd3709686b2c565f141fff4"},"schema_version":"1.0","source":{"id":"1901.06212","kind":"arxiv","version":1}},"canonical_sha256":"332a240dc2fb7538e2deb3e51b23cd8b408152d5506e2d623f7a127e3ec2309a","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"332a240dc2fb7538e2deb3e51b23cd8b408152d5506e2d623f7a127e3ec2309a","first_computed_at":"2026-05-17T23:56:04.213590Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:56:04.213590Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"gTAxN5KveJD5z84FAbeQ9j6Or0Wr+EqxxMXrQKxueVfKcVsegVcj+t9hJlHZSdhoWABBOb1nvUJxlhxszwd7BA==","signature_status":"signed_v1","signed_at":"2026-05-17T23:56:04.214200Z","signed_message":"canonical_sha256_bytes"},"source_id":"1901.06212","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:c08a4117ae2db24f53fc7f4fd565697ce06982f9ce8aa1875ea350e1f8e1e261","sha256:ac045c3e96feba9d4cf2e108879c13eccc4b7c3dc85b5f6378fe2717c3222ee0"],"state_sha256":"376d85505ed5bc696be6f9bee545dddc2d44efcc6d81a8da98ff4e3d8a49107c"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"wC/zGUZD6YfX6i/EYeJ1zYBUw+jQHZ9nuxM09YfZHt8E90lyNS2ZY7p4NTyL2DYJeP+KeyvspsGY0fSLTGckDw==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-26T18:03:03.735690Z","bundle_sha256":"ec403c302a176da64665a5c00d4f5d233204ce6f8dbd62d666f7ee067dcf399d"}}