{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2017:DBSTFMLNJCBW2F4DPZ4BMRSVJH","short_pith_number":"pith:DBSTFMLN","schema_version":"1.0","canonical_sha256":"186532b16d48836d17837e7816465549e195064f7942fdb98f9ceca69e8d87ea","source":{"kind":"arxiv","id":"1702.08892","version":3},"attestation_state":"computed","paper":{"title":"Bridging the Gap Between Value and Policy Based Reinforcement Learning","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.LG","stat.ML"],"primary_cat":"cs.AI","authors_text":"Dale Schuurmans, Kelvin Xu, Mohammad Norouzi, Ofir Nachum","submitted_at":"2017-02-28T18:06:15Z","abstract_excerpt":"We establish a new connection between value and policy based reinforcement learning (RL) based on a relationship between softmax temporal value consistency and policy optimality under entropy regularization. Specifically, we show that softmax consistent action values correspond to optimal entropy regularized policy probabilities along any action sequence, regardless of provenance. From this observation, we develop a new RL algorithm, Path Consistency Learning (PCL), that minimizes a notion of soft consistency error along multi-step action sequences extracted from both on- and off-policy traces"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1702.08892","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2017-02-28T18:06:15Z","cross_cats_sorted":["cs.LG","stat.ML"],"title_canon_sha256":"f6d653cf0884a1d721d5514f303319b7107fe7203e73440116e23f7ad20c433f","abstract_canon_sha256":"0b848298bbeed206e6aad207ae753793ae9e4d1703a0b6dca393a1ea830631ee"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T00:29:46.364617Z","signature_b64":"+QAO0DhCkJhoSjTd1VVe78cIkOE1CTFYBJDnWcJoiq23bqBVUEYUAox/0Q1jdNqPzGX4c6KlBXnDdwwvquP4BQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"186532b16d48836d17837e7816465549e195064f7942fdb98f9ceca69e8d87ea","last_reissued_at":"2026-05-18T00:29:46.364059Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T00:29:46.364059Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Bridging the Gap Between Value and Policy Based Reinforcement Learning","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.LG","stat.ML"],"primary_cat":"cs.AI","authors_text":"Dale Schuurmans, Kelvin Xu, Mohammad Norouzi, Ofir Nachum","submitted_at":"2017-02-28T18:06:15Z","abstract_excerpt":"We establish a new connection between value and policy based reinforcement learning (RL) based on a relationship between softmax temporal value consistency and policy optimality under entropy regularization. Specifically, we show that softmax consistent action values correspond to optimal entropy regularized policy probabilities along any action sequence, regardless of provenance. From this observation, we develop a new RL algorithm, Path Consistency Learning (PCL), that minimizes a notion of soft consistency error along multi-step action sequences extracted from both on- and off-policy traces"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1702.08892","kind":"arxiv","version":3},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1702.08892","created_at":"2026-05-18T00:29:46.364169+00:00"},{"alias_kind":"arxiv_version","alias_value":"1702.08892v3","created_at":"2026-05-18T00:29:46.364169+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1702.08892","created_at":"2026-05-18T00:29:46.364169+00:00"},{"alias_kind":"pith_short_12","alias_value":"DBSTFMLNJCBW","created_at":"2026-05-18T12:31:10.602751+00:00"},{"alias_kind":"pith_short_16","alias_value":"DBSTFMLNJCBW2F4D","created_at":"2026-05-18T12:31:10.602751+00:00"},{"alias_kind":"pith_short_8","alias_value":"DBSTFMLN","created_at":"2026-05-18T12:31:10.602751+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":2,"internal_anchor_count":1,"sample":[{"citing_arxiv_id":"2604.08944","citing_title":"Multi-Agent Decision-Focused Learning via Value-Aware Sequential Communication","ref_index":1,"is_internal_anchor":true},{"citing_arxiv_id":"2604.08944","citing_title":"Multi-Agent Decision-Focused Learning via Value-Aware Sequential Communication","ref_index":1,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/DBSTFMLNJCBW2F4DPZ4BMRSVJH","json":"https://pith.science/pith/DBSTFMLNJCBW2F4DPZ4BMRSVJH.json","graph_json":"https://pith.science/api/pith-number/DBSTFMLNJCBW2F4DPZ4BMRSVJH/graph.json","events_json":"https://pith.science/api/pith-number/DBSTFMLNJCBW2F4DPZ4BMRSVJH/events.json","paper":"https://pith.science/paper/DBSTFMLN"},"agent_actions":{"view_html":"https://pith.science/pith/DBSTFMLNJCBW2F4DPZ4BMRSVJH","download_json":"https://pith.science/pith/DBSTFMLNJCBW2F4DPZ4BMRSVJH.json","view_paper":"https://pith.science/paper/DBSTFMLN","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1702.08892&json=true","fetch_graph":"https://pith.science/api/pith-number/DBSTFMLNJCBW2F4DPZ4BMRSVJH/graph.json","fetch_events":"https://pith.science/api/pith-number/DBSTFMLNJCBW2F4DPZ4BMRSVJH/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/DBSTFMLNJCBW2F4DPZ4BMRSVJH/action/timestamp_anchor","attest_storage":"https://pith.science/pith/DBSTFMLNJCBW2F4DPZ4BMRSVJH/action/storage_attestation","attest_author":"https://pith.science/pith/DBSTFMLNJCBW2F4DPZ4BMRSVJH/action/author_attestation","sign_citation":"https://pith.science/pith/DBSTFMLNJCBW2F4DPZ4BMRSVJH/action/citation_signature","submit_replication":"https://pith.science/pith/DBSTFMLNJCBW2F4DPZ4BMRSVJH/action/replication_record"}},"created_at":"2026-05-18T00:29:46.364169+00:00","updated_at":"2026-05-18T00:29:46.364169+00:00"}