{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2017:EEJNUWLLGF4XP6EY6FGCTLR4XV","short_pith_number":"pith:EEJNUWLL","canonical_record":{"source":{"id":"1706.03741","kind":"arxiv","version":4},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"stat.ML","submitted_at":"2017-06-12T17:23:59Z","cross_cats_sorted":["cs.AI","cs.HC","cs.LG"],"title_canon_sha256":"c26d0dd48abbea12aea6ba91308ea0ab806eda720b3e557933897e49bf30ecc2","abstract_canon_sha256":"ff8e60ebfff031fd0eb18e5acedfde3176fe496e13eae854152798fc2da3d728"},"schema_version":"1.0"},"canonical_sha256":"2112da596b317977f898f14c29ae3cbd56e9403932bbfda3094ee5b2169aad7f","source":{"kind":"arxiv","id":"1706.03741","version":4},"source_aliases":[{"alias_kind":"arxiv","alias_value":"1706.03741","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"arxiv_version","alias_value":"1706.03741v4","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1706.03741","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"pith_short_12","alias_value":"EEJNUWLLGF4X","created_at":"2026-05-18T12:31:12Z"},{"alias_kind":"pith_short_16","alias_value":"EEJNUWLLGF4XP6EY","created_at":"2026-05-18T12:31:12Z"},{"alias_kind":"pith_short_8","alias_value":"EEJNUWLL","created_at":"2026-05-18T12:31:12Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2017:EEJNUWLLGF4XP6EY6FGCTLR4XV","target":"record","payload":{"canonical_record":{"source":{"id":"1706.03741","kind":"arxiv","version":4},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"stat.ML","submitted_at":"2017-06-12T17:23:59Z","cross_cats_sorted":["cs.AI","cs.HC","cs.LG"],"title_canon_sha256":"c26d0dd48abbea12aea6ba91308ea0ab806eda720b3e557933897e49bf30ecc2","abstract_canon_sha256":"ff8e60ebfff031fd0eb18e5acedfde3176fe496e13eae854152798fc2da3d728"},"schema_version":"1.0"},"canonical_sha256":"2112da596b317977f898f14c29ae3cbd56e9403932bbfda3094ee5b2169aad7f","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:48.475047Z","signature_b64":"x9nmbELb+EO1o3acyxgfkQYhfsGWhfJHSBfILD7r0Wm9VIE5nx3Vho2nKQcr0ztNQIcA9opfccbEBrB+1eLOBQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"2112da596b317977f898f14c29ae3cbd56e9403932bbfda3094ee5b2169aad7f","last_reissued_at":"2026-05-17T23:38:48.474584Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:48.474584Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"1706.03741","source_version":4,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:48Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"I5oLR+gxgTgTUFoL0BDw3POo58B9Vvvs+MvJzvDqHFGWDMRVrKIzbgmAlJOsUhxaHX1gK2gvwMHY+UEaiIXzDg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-30T06:13:58.756038Z"},"content_sha256":"234b2cd730783840f12b8c681619dd813f5058556db8472e64bcc0b523ef4730","schema_version":"1.0","event_id":"sha256:234b2cd730783840f12b8c681619dd813f5058556db8472e64bcc0b523ef4730"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2017:EEJNUWLLGF4XP6EY6FGCTLR4XV","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Deep reinforcement learning from human preferences","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"Reinforcement learning agents can learn complex behaviors such as Atari games and robot locomotion from human preferences over pairs of trajectory segments instead of engineered rewards.","cross_cats":["cs.AI","cs.HC","cs.LG"],"primary_cat":"stat.ML","authors_text":"Dario Amodei, Jan Leike, Miljan Martic, Paul Christiano, Shane Legg, Tom B. Brown","submitted_at":"2017-06-12T17:23:59Z","abstract_excerpt":"For sophisticated reinforcement learning (RL) systems to interact usefully with real-world environments, we need to communicate complex goals to these systems. In this work, we explore goals defined in terms of (non-expert) human preferences between pairs of trajectory segments. We show that this approach can effectively solve complex RL tasks without access to the reward function, including Atari games and simulated robot locomotion, while providing feedback on less than one percent of our agent's interactions with the environment. This reduces the cost of human oversight far enough that it c"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"We show that this approach can effectively solve complex RL tasks without access to the reward function, including Atari games and simulated robot locomotion, while providing feedback on less than one percent of our agent's interactions with the environment.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That human preferences over short trajectory segments can be consistently modeled by a reward function that generalizes well enough to guide policy optimization without reward hacking or inconsistency on the full task.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Reinforcement learning agents solve complex tasks without access to the reward function by training a reward predictor from human comparisons of trajectory segments, requiring feedback on less than 1% of interactions.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Reinforcement learning agents can learn complex behaviors such as Atari games and robot locomotion from human preferences over pairs of trajectory segments instead of engineered rewards.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"1d1477574e4b56d3f73dce0501982e09c2d303b9151a8b9b668403a8f2105359"},"source":{"id":"1706.03741","kind":"arxiv","version":4},"verdict":{"id":"1ecccbcb-5465-406c-a120-2b4e75d88e59","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-16T08:35:19.884292Z","strongest_claim":"We show that this approach can effectively solve complex RL tasks without access to the reward function, including Atari games and simulated robot locomotion, while providing feedback on less than one percent of our agent's interactions with the environment.","one_line_summary":"Reinforcement learning agents solve complex tasks without access to the reward function by training a reward predictor from human comparisons of trajectory segments, requiring feedback on less than 1% of interactions.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That human preferences over short trajectory segments can be consistently modeled by a reward function that generalizes well enough to guide policy optimization without reward hacking or inconsistency on the full task.","pith_extraction_headline":"Reinforcement learning agents can learn complex behaviors such as Atari games and robot locomotion from human preferences over pairs of trajectory segments instead of engineered rewards."},"references":{"count":14,"sample":[{"doi":"","year":null,"title":"TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed Systems","work_id":"91f3c09e-dae6-48ca-80c0-463dd1b1f6e1","ref_index":1,"cited_arxiv_id":"1603.04467","is_internal_anchor":true},{"doi":"","year":null,"title":"Concrete Problems in AI Safety","work_id":"c8d14fbe-6eab-464a-95b3-778aabd82fa3","ref_index":2,"cited_arxiv_id":"1606.06565","is_internal_anchor":true},{"doi":"","year":2010,"title":"A bayesian interactive optimization approach to procedural animation design","work_id":"a2ea06cf-ee83-47a5-846a-a82273829a4d","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Greg Brockman, Vicki Cheung, Ludwig Pettersson, Jonas Schneider, John Schulman, Jie Tang, and Wojciech Zaremba. OpenAI Gym. arXiv preprint arXiv:1606.01540,","work_id":"6af98f3f-f074-41ae-a689-7dd7b4b8efde","ref_index":4,"cited_arxiv_id":"1606.01540","is_internal_anchor":true},{"doi":"","year":null,"title":"Deep Q-learning from Demonstrations","work_id":"3d67a954-e5a3-409f-a53c-3100d6063c5f","ref_index":5,"cited_arxiv_id":"1704.03732","is_internal_anchor":true}],"resolved_work":14,"snapshot_sha256":"fecc59eb2cfc6f5c54d2665625f562214f2620dcf5d38746a9a0add16ff206e5","internal_anchors":6},"formal_canon":{"evidence_count":2,"snapshot_sha256":"856c0889dbc7c0af35cd064d7e456d5bc26700aae668d11f3d0b896725ccf03e"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"1ecccbcb-5465-406c-a120-2b4e75d88e59"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:48Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"InmpA8oHRjRwbk2wgy4cw0Pw4XKiXPUh3mkPZrU7b5KT0G/GfhSUroKYJ9LEAiSquq0xMrT++YcMCQlS6xbZAA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-30T06:13:58.756605Z"},"content_sha256":"5b92d141149cc2c72aca3db5aa3164a4e2123762ed4a036660a3f848de2f9dec","schema_version":"1.0","event_id":"sha256:5b92d141149cc2c72aca3db5aa3164a4e2123762ed4a036660a3f848de2f9dec"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/EEJNUWLLGF4XP6EY6FGCTLR4XV/bundle.json","state_url":"https://pith.science/pith/EEJNUWLLGF4XP6EY6FGCTLR4XV/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/EEJNUWLLGF4XP6EY6FGCTLR4XV/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-30T06:13:58Z","links":{"resolver":"https://pith.science/pith/EEJNUWLLGF4XP6EY6FGCTLR4XV","bundle":"https://pith.science/pith/EEJNUWLLGF4XP6EY6FGCTLR4XV/bundle.json","state":"https://pith.science/pith/EEJNUWLLGF4XP6EY6FGCTLR4XV/state.json","well_known_bundle":"https://pith.science/.well-known/pith/EEJNUWLLGF4XP6EY6FGCTLR4XV/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2017:EEJNUWLLGF4XP6EY6FGCTLR4XV","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"ff8e60ebfff031fd0eb18e5acedfde3176fe496e13eae854152798fc2da3d728","cross_cats_sorted":["cs.AI","cs.HC","cs.LG"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"stat.ML","submitted_at":"2017-06-12T17:23:59Z","title_canon_sha256":"c26d0dd48abbea12aea6ba91308ea0ab806eda720b3e557933897e49bf30ecc2"},"schema_version":"1.0","source":{"id":"1706.03741","kind":"arxiv","version":4}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"1706.03741","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"arxiv_version","alias_value":"1706.03741v4","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1706.03741","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"pith_short_12","alias_value":"EEJNUWLLGF4X","created_at":"2026-05-18T12:31:12Z"},{"alias_kind":"pith_short_16","alias_value":"EEJNUWLLGF4XP6EY","created_at":"2026-05-18T12:31:12Z"},{"alias_kind":"pith_short_8","alias_value":"EEJNUWLL","created_at":"2026-05-18T12:31:12Z"}],"graph_snapshots":[{"event_id":"sha256:5b92d141149cc2c72aca3db5aa3164a4e2123762ed4a036660a3f848de2f9dec","target":"graph","created_at":"2026-05-17T23:38:48Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"We show that this approach can effectively solve complex RL tasks without access to the reward function, including Atari games and simulated robot locomotion, while providing feedback on less than one percent of our agent's interactions with the environment."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That human preferences over short trajectory segments can be consistently modeled by a reward function that generalizes well enough to guide policy optimization without reward hacking or inconsistency on the full task."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Reinforcement learning agents solve complex tasks without access to the reward function by training a reward predictor from human comparisons of trajectory segments, requiring feedback on less than 1% of interactions."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Reinforcement learning agents can learn complex behaviors such as Atari games and robot locomotion from human preferences over pairs of trajectory segments instead of engineered rewards."}],"snapshot_sha256":"1d1477574e4b56d3f73dce0501982e09c2d303b9151a8b9b668403a8f2105359"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"856c0889dbc7c0af35cd064d7e456d5bc26700aae668d11f3d0b896725ccf03e"},"paper":{"abstract_excerpt":"For sophisticated reinforcement learning (RL) systems to interact usefully with real-world environments, we need to communicate complex goals to these systems. In this work, we explore goals defined in terms of (non-expert) human preferences between pairs of trajectory segments. We show that this approach can effectively solve complex RL tasks without access to the reward function, including Atari games and simulated robot locomotion, while providing feedback on less than one percent of our agent's interactions with the environment. This reduces the cost of human oversight far enough that it c","authors_text":"Dario Amodei, Jan Leike, Miljan Martic, Paul Christiano, Shane Legg, Tom B. Brown","cross_cats":["cs.AI","cs.HC","cs.LG"],"headline":"Reinforcement learning agents can learn complex behaviors such as Atari games and robot locomotion from human preferences over pairs of trajectory segments instead of engineered rewards.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"stat.ML","submitted_at":"2017-06-12T17:23:59Z","title":"Deep reinforcement learning from human preferences"},"references":{"count":14,"internal_anchors":6,"resolved_work":14,"sample":[{"cited_arxiv_id":"1603.04467","doi":"","is_internal_anchor":true,"ref_index":1,"title":"TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed Systems","work_id":"91f3c09e-dae6-48ca-80c0-463dd1b1f6e1","year":null},{"cited_arxiv_id":"1606.06565","doi":"","is_internal_anchor":true,"ref_index":2,"title":"Concrete Problems in AI Safety","work_id":"c8d14fbe-6eab-464a-95b3-778aabd82fa3","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"A bayesian interactive optimization approach to procedural animation design","work_id":"a2ea06cf-ee83-47a5-846a-a82273829a4d","year":2010},{"cited_arxiv_id":"1606.01540","doi":"","is_internal_anchor":true,"ref_index":4,"title":"Greg Brockman, Vicki Cheung, Ludwig Pettersson, Jonas Schneider, John Schulman, Jie Tang, and Wojciech Zaremba. OpenAI Gym. arXiv preprint arXiv:1606.01540,","work_id":"6af98f3f-f074-41ae-a689-7dd7b4b8efde","year":null},{"cited_arxiv_id":"1704.03732","doi":"","is_internal_anchor":true,"ref_index":5,"title":"Deep Q-learning from Demonstrations","work_id":"3d67a954-e5a3-409f-a53c-3100d6063c5f","year":null}],"snapshot_sha256":"fecc59eb2cfc6f5c54d2665625f562214f2620dcf5d38746a9a0add16ff206e5"},"source":{"id":"1706.03741","kind":"arxiv","version":4},"verdict":{"created_at":"2026-05-16T08:35:19.884292Z","id":"1ecccbcb-5465-406c-a120-2b4e75d88e59","model_set":{"reader":"grok-4.3"},"one_line_summary":"Reinforcement learning agents solve complex tasks without access to the reward function by training a reward predictor from human comparisons of trajectory segments, requiring feedback on less than 1% of interactions.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Reinforcement learning agents can learn complex behaviors such as Atari games and robot locomotion from human preferences over pairs of trajectory segments instead of engineered rewards.","strongest_claim":"We show that this approach can effectively solve complex RL tasks without access to the reward function, including Atari games and simulated robot locomotion, while providing feedback on less than one percent of our agent's interactions with the environment.","weakest_assumption":"That human preferences over short trajectory segments can be consistently modeled by a reward function that generalizes well enough to guide policy optimization without reward hacking or inconsistency on the full task."}},"verdict_id":"1ecccbcb-5465-406c-a120-2b4e75d88e59"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:234b2cd730783840f12b8c681619dd813f5058556db8472e64bcc0b523ef4730","target":"record","created_at":"2026-05-17T23:38:48Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"ff8e60ebfff031fd0eb18e5acedfde3176fe496e13eae854152798fc2da3d728","cross_cats_sorted":["cs.AI","cs.HC","cs.LG"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"stat.ML","submitted_at":"2017-06-12T17:23:59Z","title_canon_sha256":"c26d0dd48abbea12aea6ba91308ea0ab806eda720b3e557933897e49bf30ecc2"},"schema_version":"1.0","source":{"id":"1706.03741","kind":"arxiv","version":4}},"canonical_sha256":"2112da596b317977f898f14c29ae3cbd56e9403932bbfda3094ee5b2169aad7f","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"2112da596b317977f898f14c29ae3cbd56e9403932bbfda3094ee5b2169aad7f","first_computed_at":"2026-05-17T23:38:48.474584Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:48.474584Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"x9nmbELb+EO1o3acyxgfkQYhfsGWhfJHSBfILD7r0Wm9VIE5nx3Vho2nKQcr0ztNQIcA9opfccbEBrB+1eLOBQ==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:48.475047Z","signed_message":"canonical_sha256_bytes"},"source_id":"1706.03741","source_kind":"arxiv","source_version":4}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:234b2cd730783840f12b8c681619dd813f5058556db8472e64bcc0b523ef4730","sha256:5b92d141149cc2c72aca3db5aa3164a4e2123762ed4a036660a3f848de2f9dec"],"state_sha256":"531c71400819c28cc39cd8f545f413f49d38e794ab69d5d8a24a71249700fe34"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"q8LAMSxGzG/uwJcHyLEp2bN9fAaArOVxi2u6sXZOWBKOz3J6IySDGKk5T4evdNSJgh76df6u4zATeCFKX1gMAw==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-30T06:13:58.759372Z","bundle_sha256":"a397b7ea4f2acc335309902db7e24a85236b149f3dd55a60f33577d4c7cf6895"}}