{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2017:DBSTFMLNJCBW2F4DPZ4BMRSVJH","short_pith_number":"pith:DBSTFMLN","canonical_record":{"source":{"id":"1702.08892","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2017-02-28T18:06:15Z","cross_cats_sorted":["cs.LG","stat.ML"],"title_canon_sha256":"f6d653cf0884a1d721d5514f303319b7107fe7203e73440116e23f7ad20c433f","abstract_canon_sha256":"0b848298bbeed206e6aad207ae753793ae9e4d1703a0b6dca393a1ea830631ee"},"schema_version":"1.0"},"canonical_sha256":"186532b16d48836d17837e7816465549e195064f7942fdb98f9ceca69e8d87ea","source":{"kind":"arxiv","id":"1702.08892","version":3},"source_aliases":[{"alias_kind":"arxiv","alias_value":"1702.08892","created_at":"2026-05-18T00:29:46Z"},{"alias_kind":"arxiv_version","alias_value":"1702.08892v3","created_at":"2026-05-18T00:29:46Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1702.08892","created_at":"2026-05-18T00:29:46Z"},{"alias_kind":"pith_short_12","alias_value":"DBSTFMLNJCBW","created_at":"2026-05-18T12:31:10Z"},{"alias_kind":"pith_short_16","alias_value":"DBSTFMLNJCBW2F4D","created_at":"2026-05-18T12:31:10Z"},{"alias_kind":"pith_short_8","alias_value":"DBSTFMLN","created_at":"2026-05-18T12:31:10Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2017:DBSTFMLNJCBW2F4DPZ4BMRSVJH","target":"record","payload":{"canonical_record":{"source":{"id":"1702.08892","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2017-02-28T18:06:15Z","cross_cats_sorted":["cs.LG","stat.ML"],"title_canon_sha256":"f6d653cf0884a1d721d5514f303319b7107fe7203e73440116e23f7ad20c433f","abstract_canon_sha256":"0b848298bbeed206e6aad207ae753793ae9e4d1703a0b6dca393a1ea830631ee"},"schema_version":"1.0"},"canonical_sha256":"186532b16d48836d17837e7816465549e195064f7942fdb98f9ceca69e8d87ea","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T00:29:46.364617Z","signature_b64":"+QAO0DhCkJhoSjTd1VVe78cIkOE1CTFYBJDnWcJoiq23bqBVUEYUAox/0Q1jdNqPzGX4c6KlBXnDdwwvquP4BQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"186532b16d48836d17837e7816465549e195064f7942fdb98f9ceca69e8d87ea","last_reissued_at":"2026-05-18T00:29:46.364059Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T00:29:46.364059Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"1702.08892","source_version":3,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T00:29:46Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"y4s4OqsdKGGSN080QeRKPdPPXTHi52vhZZvZ0S0QTEUYowqyu2OmY46kMmSwCK+SudC5uWNG/3gyWBtmRdm9Bw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-18T20:03:32.790513Z"},"content_sha256":"aecc0bb05f0b66e06eb174ab56d590978740f6fe855e1956e9227125acf3ea13","schema_version":"1.0","event_id":"sha256:aecc0bb05f0b66e06eb174ab56d590978740f6fe855e1956e9227125acf3ea13"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2017:DBSTFMLNJCBW2F4DPZ4BMRSVJH","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Bridging the Gap Between Value and Policy Based Reinforcement Learning","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.LG","stat.ML"],"primary_cat":"cs.AI","authors_text":"Dale Schuurmans, Kelvin Xu, Mohammad Norouzi, Ofir Nachum","submitted_at":"2017-02-28T18:06:15Z","abstract_excerpt":"We establish a new connection between value and policy based reinforcement learning (RL) based on a relationship between softmax temporal value consistency and policy optimality under entropy regularization. Specifically, we show that softmax consistent action values correspond to optimal entropy regularized policy probabilities along any action sequence, regardless of provenance. From this observation, we develop a new RL algorithm, Path Consistency Learning (PCL), that minimizes a notion of soft consistency error along multi-step action sequences extracted from both on- and off-policy traces"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1702.08892","kind":"arxiv","version":3},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T00:29:46Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"fb9ZBljvgGO24hg8ba6sEVKxQZ/SbJ/0bzGYpGgt3hVM30LFAFIe5fjveoGy03fXRArg8Pbu/KI1SEJD7sSuCA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-18T20:03:32.791066Z"},"content_sha256":"275e83ed2e913236ce4350fc8b0a23032e23acd8162a3678970768327f5e1dac","schema_version":"1.0","event_id":"sha256:275e83ed2e913236ce4350fc8b0a23032e23acd8162a3678970768327f5e1dac"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/DBSTFMLNJCBW2F4DPZ4BMRSVJH/bundle.json","state_url":"https://pith.science/pith/DBSTFMLNJCBW2F4DPZ4BMRSVJH/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/DBSTFMLNJCBW2F4DPZ4BMRSVJH/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-18T20:03:32Z","links":{"resolver":"https://pith.science/pith/DBSTFMLNJCBW2F4DPZ4BMRSVJH","bundle":"https://pith.science/pith/DBSTFMLNJCBW2F4DPZ4BMRSVJH/bundle.json","state":"https://pith.science/pith/DBSTFMLNJCBW2F4DPZ4BMRSVJH/state.json","well_known_bundle":"https://pith.science/.well-known/pith/DBSTFMLNJCBW2F4DPZ4BMRSVJH/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2017:DBSTFMLNJCBW2F4DPZ4BMRSVJH","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"0b848298bbeed206e6aad207ae753793ae9e4d1703a0b6dca393a1ea830631ee","cross_cats_sorted":["cs.LG","stat.ML"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2017-02-28T18:06:15Z","title_canon_sha256":"f6d653cf0884a1d721d5514f303319b7107fe7203e73440116e23f7ad20c433f"},"schema_version":"1.0","source":{"id":"1702.08892","kind":"arxiv","version":3}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"1702.08892","created_at":"2026-05-18T00:29:46Z"},{"alias_kind":"arxiv_version","alias_value":"1702.08892v3","created_at":"2026-05-18T00:29:46Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1702.08892","created_at":"2026-05-18T00:29:46Z"},{"alias_kind":"pith_short_12","alias_value":"DBSTFMLNJCBW","created_at":"2026-05-18T12:31:10Z"},{"alias_kind":"pith_short_16","alias_value":"DBSTFMLNJCBW2F4D","created_at":"2026-05-18T12:31:10Z"},{"alias_kind":"pith_short_8","alias_value":"DBSTFMLN","created_at":"2026-05-18T12:31:10Z"}],"graph_snapshots":[{"event_id":"sha256:275e83ed2e913236ce4350fc8b0a23032e23acd8162a3678970768327f5e1dac","target":"graph","created_at":"2026-05-18T00:29:46Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"We establish a new connection between value and policy based reinforcement learning (RL) based on a relationship between softmax temporal value consistency and policy optimality under entropy regularization. Specifically, we show that softmax consistent action values correspond to optimal entropy regularized policy probabilities along any action sequence, regardless of provenance. From this observation, we develop a new RL algorithm, Path Consistency Learning (PCL), that minimizes a notion of soft consistency error along multi-step action sequences extracted from both on- and off-policy traces","authors_text":"Dale Schuurmans, Kelvin Xu, Mohammad Norouzi, Ofir Nachum","cross_cats":["cs.LG","stat.ML"],"headline":"","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2017-02-28T18:06:15Z","title":"Bridging the Gap Between Value and Policy Based Reinforcement Learning"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1702.08892","kind":"arxiv","version":3},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:aecc0bb05f0b66e06eb174ab56d590978740f6fe855e1956e9227125acf3ea13","target":"record","created_at":"2026-05-18T00:29:46Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"0b848298bbeed206e6aad207ae753793ae9e4d1703a0b6dca393a1ea830631ee","cross_cats_sorted":["cs.LG","stat.ML"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2017-02-28T18:06:15Z","title_canon_sha256":"f6d653cf0884a1d721d5514f303319b7107fe7203e73440116e23f7ad20c433f"},"schema_version":"1.0","source":{"id":"1702.08892","kind":"arxiv","version":3}},"canonical_sha256":"186532b16d48836d17837e7816465549e195064f7942fdb98f9ceca69e8d87ea","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"186532b16d48836d17837e7816465549e195064f7942fdb98f9ceca69e8d87ea","first_computed_at":"2026-05-18T00:29:46.364059Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-18T00:29:46.364059Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"+QAO0DhCkJhoSjTd1VVe78cIkOE1CTFYBJDnWcJoiq23bqBVUEYUAox/0Q1jdNqPzGX4c6KlBXnDdwwvquP4BQ==","signature_status":"signed_v1","signed_at":"2026-05-18T00:29:46.364617Z","signed_message":"canonical_sha256_bytes"},"source_id":"1702.08892","source_kind":"arxiv","source_version":3}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:aecc0bb05f0b66e06eb174ab56d590978740f6fe855e1956e9227125acf3ea13","sha256:275e83ed2e913236ce4350fc8b0a23032e23acd8162a3678970768327f5e1dac"],"state_sha256":"c4dcc90de0ae756b03bc2cc502c1cbc5a850d193d8062086938a3fc65f72b2bb"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"3dS8G/QblOYqclvjv7qv0SD/T9bCw1L9D77dMYgaZwgAxUT558gUtkv6KdkAWgYKgAA9+n1h2jzr3aQYjcojCw==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-18T20:03:32.792904Z","bundle_sha256":"d0983c4703f9ff19198fd9d4a43d835c9e29e4793fd28182d66d4e5fb89c657c"}}