{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2018:OY6UIUB3I5I3RU2N7ZVQQZOTTO","short_pith_number":"pith:OY6UIUB3","schema_version":"1.0","canonical_sha256":"763d44503b4751b8d34dfe6b0865d39b8c68e6f41ca8e73089ab997924d53d9d","source":{"kind":"arxiv","id":"1805.11074","version":3},"attestation_state":"computed","paper":{"title":"Reward Constrained Policy Optimization","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","stat.ML"],"primary_cat":"cs.LG","authors_text":"Chen Tessler, Daniel J. Mankowitz, Shie Mannor","submitted_at":"2018-05-28T17:31:11Z","abstract_excerpt":"Solving tasks in Reinforcement Learning is no easy feat. As the goal of the agent is to maximize the accumulated reward, it often learns to exploit loopholes and misspecifications in the reward signal resulting in unwanted behavior. While constraints may solve this issue, there is no closed form solution for general constraints. In this work we present a novel multi-timescale approach for constrained policy optimization, called `Reward Constrained Policy Optimization' (RCPO), which uses an alternative penalty signal to guide the policy towards a constraint satisfying one. We prove the converge"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1805.11074","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2018-05-28T17:31:11Z","cross_cats_sorted":["cs.AI","stat.ML"],"title_canon_sha256":"6ad4503a61e54aa72d763a4654ce6df6dbdae9b9f11d9fdc8100fb581baa447a","abstract_canon_sha256":"e8c4da5368282395e3a8694b960f3bfa5251ba844a65fe5e68c2005f31c6d322"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:57:31.896191Z","signature_b64":"DnV0aGSk8yWGXUvEfa80jXX9kd6mHtRegM4R0N6wi3IBy410y2DtT6LlWwuxxGjqZ1m+Wc2pco8QkE33GXy1Cw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"763d44503b4751b8d34dfe6b0865d39b8c68e6f41ca8e73089ab997924d53d9d","last_reissued_at":"2026-05-17T23:57:31.895745Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:57:31.895745Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Reward Constrained Policy Optimization","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","stat.ML"],"primary_cat":"cs.LG","authors_text":"Chen Tessler, Daniel J. Mankowitz, Shie Mannor","submitted_at":"2018-05-28T17:31:11Z","abstract_excerpt":"Solving tasks in Reinforcement Learning is no easy feat. As the goal of the agent is to maximize the accumulated reward, it often learns to exploit loopholes and misspecifications in the reward signal resulting in unwanted behavior. While constraints may solve this issue, there is no closed form solution for general constraints. In this work we present a novel multi-timescale approach for constrained policy optimization, called `Reward Constrained Policy Optimization' (RCPO), which uses an alternative penalty signal to guide the policy towards a constraint satisfying one. We prove the converge"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1805.11074","kind":"arxiv","version":3},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1805.11074","created_at":"2026-05-17T23:57:31.895823+00:00"},{"alias_kind":"arxiv_version","alias_value":"1805.11074v3","created_at":"2026-05-17T23:57:31.895823+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1805.11074","created_at":"2026-05-17T23:57:31.895823+00:00"},{"alias_kind":"pith_short_12","alias_value":"OY6UIUB3I5I3","created_at":"2026-05-18T12:32:43.782077+00:00"},{"alias_kind":"pith_short_16","alias_value":"OY6UIUB3I5I3RU2N","created_at":"2026-05-18T12:32:43.782077+00:00"},{"alias_kind":"pith_short_8","alias_value":"OY6UIUB3","created_at":"2026-05-18T12:32:43.782077+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":15,"internal_anchor_count":7,"sample":[{"citing_arxiv_id":"2605.11975","citing_title":"Stochastic Minimum-Cost Reach-Avoid Reinforcement Learning","ref_index":11,"is_internal_anchor":true},{"citing_arxiv_id":"2605.16709","citing_title":"Covert Multi-bit LLM Watermarking: An Information Theory and Coding Approach","ref_index":37,"is_internal_anchor":true},{"citing_arxiv_id":"2510.11491","citing_title":"Constraint-Aware Reinforcement Learning via Adaptive Action Scaling","ref_index":15,"is_internal_anchor":true},{"citing_arxiv_id":"2511.14135","citing_title":"AdaFair-MARL: Enforcing Adaptive Fairness Constraints in Multi-Agent Reinforcement Learning","ref_index":44,"is_internal_anchor":true},{"citing_arxiv_id":"2602.02924","citing_title":"How Does the Lagrangian Guide Safe Reinforcement Learning through Diffusion Models?","ref_index":18,"is_internal_anchor":true},{"citing_arxiv_id":"2605.14174","citing_title":"Safety-Constrained Reinforcement Learning with Post-Training Reachability Verification for Robot Navigation","ref_index":3,"is_internal_anchor":true},{"citing_arxiv_id":"2605.14246","citing_title":"Action-Conditioned Risk Gating for Safety-Critical Control under Partial Observability","ref_index":31,"is_internal_anchor":true},{"citing_arxiv_id":"2605.11688","citing_title":"Shaping Zero-Shot Coordination via State Blocking","ref_index":43,"is_internal_anchor":false},{"citing_arxiv_id":"2605.11975","citing_title":"Stochastic Minimum-Cost Reach-Avoid Reinforcement Learning","ref_index":11,"is_internal_anchor":false},{"citing_arxiv_id":"2605.10481","citing_title":"Safe Multi-Agent Behavior Must Be Maintained, Not Merely Asserted: Constraint Drift in LLM-Based Multi-Agent Systems","ref_index":37,"is_internal_anchor":false},{"citing_arxiv_id":"2604.19024","citing_title":"Policy Gradient Primal-Dual Method for Safe Reinforcement Learning from Human Feedback","ref_index":47,"is_internal_anchor":false},{"citing_arxiv_id":"2604.12667","citing_title":"Safe reinforcement learning with online filtering for fatigue-predictive human-robot task planning and allocation in production","ref_index":73,"is_internal_anchor":false},{"citing_arxiv_id":"2604.07457","citing_title":"CMP: Robust Whole-Body Tracking for Loco-Manipulation via Competence Manifold Projection","ref_index":47,"is_internal_anchor":false},{"citing_arxiv_id":"2604.06522","citing_title":"Constrained Policy Optimization for Provably Fair Order Matching","ref_index":1,"is_internal_anchor":false},{"citing_arxiv_id":"2605.06992","citing_title":"Why Does Agentic Safety Fail to Generalize Across Tasks?","ref_index":105,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/OY6UIUB3I5I3RU2N7ZVQQZOTTO","json":"https://pith.science/pith/OY6UIUB3I5I3RU2N7ZVQQZOTTO.json","graph_json":"https://pith.science/api/pith-number/OY6UIUB3I5I3RU2N7ZVQQZOTTO/graph.json","events_json":"https://pith.science/api/pith-number/OY6UIUB3I5I3RU2N7ZVQQZOTTO/events.json","paper":"https://pith.science/paper/OY6UIUB3"},"agent_actions":{"view_html":"https://pith.science/pith/OY6UIUB3I5I3RU2N7ZVQQZOTTO","download_json":"https://pith.science/pith/OY6UIUB3I5I3RU2N7ZVQQZOTTO.json","view_paper":"https://pith.science/paper/OY6UIUB3","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1805.11074&json=true","fetch_graph":"https://pith.science/api/pith-number/OY6UIUB3I5I3RU2N7ZVQQZOTTO/graph.json","fetch_events":"https://pith.science/api/pith-number/OY6UIUB3I5I3RU2N7ZVQQZOTTO/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/OY6UIUB3I5I3RU2N7ZVQQZOTTO/action/timestamp_anchor","attest_storage":"https://pith.science/pith/OY6UIUB3I5I3RU2N7ZVQQZOTTO/action/storage_attestation","attest_author":"https://pith.science/pith/OY6UIUB3I5I3RU2N7ZVQQZOTTO/action/author_attestation","sign_citation":"https://pith.science/pith/OY6UIUB3I5I3RU2N7ZVQQZOTTO/action/citation_signature","submit_replication":"https://pith.science/pith/OY6UIUB3I5I3RU2N7ZVQQZOTTO/action/replication_record"}},"created_at":"2026-05-17T23:57:31.895823+00:00","updated_at":"2026-05-17T23:57:31.895823+00:00"}