{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2018:3TPZDPAYLQPR6Q7C4E64OYGFAD","short_pith_number":"pith:3TPZDPAY","schema_version":"1.0","canonical_sha256":"dcdf91bc185c1f1f43e2e13dc760c500c66ccef12a2faf13c9222f170c7272f2","source":{"kind":"arxiv","id":"1806.01186","version":2},"attestation_state":"computed","paper":{"title":"Penalizing side effects using stepwise relative reachability","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","stat.ML"],"primary_cat":"cs.LG","authors_text":"Laurent Orseau, Miljan Martic, Ramana Kumar, Shane Legg, Victoria Krakovna","submitted_at":"2018-06-04T16:30:17Z","abstract_excerpt":"How can we design safe reinforcement learning agents that avoid unnecessary disruptions to their environment? We show that current approaches to penalizing side effects can introduce bad incentives, e.g. to prevent any irreversible changes in the environment, including the actions of other agents. To isolate the source of such undesirable incentives, we break down side effects penalties into two components: a baseline state and a measure of deviation from this baseline state. We argue that some of these incentives arise from the choice of baseline, and others arise from the choice of deviation"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1806.01186","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2018-06-04T16:30:17Z","cross_cats_sorted":["cs.AI","stat.ML"],"title_canon_sha256":"d930486a05655ae2800ed54cb8293b41a93b1c0dbb79e929e572df1c6a19dea8","abstract_canon_sha256":"f1645414f0224b5ca516807347a6c3400ece7117ff8b970463f750ced46cd0e3"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:51:49.350724Z","signature_b64":"bnCFwHivwaDQ6Qrb+MDssfDiJ/jdJQNsymeIWU73WrRKJ6xRHFaOjMY9YQZ43f3Z67ij2TFaisHD57EtbILyBg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"dcdf91bc185c1f1f43e2e13dc760c500c66ccef12a2faf13c9222f170c7272f2","last_reissued_at":"2026-05-17T23:51:49.350034Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:51:49.350034Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Penalizing side effects using stepwise relative reachability","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","stat.ML"],"primary_cat":"cs.LG","authors_text":"Laurent Orseau, Miljan Martic, Ramana Kumar, Shane Legg, Victoria Krakovna","submitted_at":"2018-06-04T16:30:17Z","abstract_excerpt":"How can we design safe reinforcement learning agents that avoid unnecessary disruptions to their environment? We show that current approaches to penalizing side effects can introduce bad incentives, e.g. to prevent any irreversible changes in the environment, including the actions of other agents. To isolate the source of such undesirable incentives, we break down side effects penalties into two components: a baseline state and a measure of deviation from this baseline state. We argue that some of these incentives arise from the choice of baseline, and others arise from the choice of deviation"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1806.01186","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1806.01186","created_at":"2026-05-17T23:51:49.350127+00:00"},{"alias_kind":"arxiv_version","alias_value":"1806.01186v2","created_at":"2026-05-17T23:51:49.350127+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1806.01186","created_at":"2026-05-17T23:51:49.350127+00:00"},{"alias_kind":"pith_short_12","alias_value":"3TPZDPAYLQPR","created_at":"2026-05-18T12:32:05.422762+00:00"},{"alias_kind":"pith_short_16","alias_value":"3TPZDPAYLQPR6Q7C","created_at":"2026-05-18T12:32:05.422762+00:00"},{"alias_kind":"pith_short_8","alias_value":"3TPZDPAY","created_at":"2026-05-18T12:32:05.422762+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":2,"internal_anchor_count":2,"sample":[{"citing_arxiv_id":"1906.10918","citing_title":"Towards Empathic Deep Q-Learning","ref_index":5,"is_internal_anchor":true},{"citing_arxiv_id":"1907.01285","citing_title":"Learning the Arrow of Time","ref_index":10,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/3TPZDPAYLQPR6Q7C4E64OYGFAD","json":"https://pith.science/pith/3TPZDPAYLQPR6Q7C4E64OYGFAD.json","graph_json":"https://pith.science/api/pith-number/3TPZDPAYLQPR6Q7C4E64OYGFAD/graph.json","events_json":"https://pith.science/api/pith-number/3TPZDPAYLQPR6Q7C4E64OYGFAD/events.json","paper":"https://pith.science/paper/3TPZDPAY"},"agent_actions":{"view_html":"https://pith.science/pith/3TPZDPAYLQPR6Q7C4E64OYGFAD","download_json":"https://pith.science/pith/3TPZDPAYLQPR6Q7C4E64OYGFAD.json","view_paper":"https://pith.science/paper/3TPZDPAY","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1806.01186&json=true","fetch_graph":"https://pith.science/api/pith-number/3TPZDPAYLQPR6Q7C4E64OYGFAD/graph.json","fetch_events":"https://pith.science/api/pith-number/3TPZDPAYLQPR6Q7C4E64OYGFAD/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/3TPZDPAYLQPR6Q7C4E64OYGFAD/action/timestamp_anchor","attest_storage":"https://pith.science/pith/3TPZDPAYLQPR6Q7C4E64OYGFAD/action/storage_attestation","attest_author":"https://pith.science/pith/3TPZDPAYLQPR6Q7C4E64OYGFAD/action/author_attestation","sign_citation":"https://pith.science/pith/3TPZDPAYLQPR6Q7C4E64OYGFAD/action/citation_signature","submit_replication":"https://pith.science/pith/3TPZDPAYLQPR6Q7C4E64OYGFAD/action/replication_record"}},"created_at":"2026-05-17T23:51:49.350127+00:00","updated_at":"2026-05-17T23:51:49.350127+00:00"}