{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2018:RAJTRGZF7NXD36QJPDMCDKHAOD","short_pith_number":"pith:RAJTRGZF","schema_version":"1.0","canonical_sha256":"8813389b25fb6e3dfa0978d821a8e070ff09b23c11ea0cd2adf5ba406a352ec6","source":{"kind":"arxiv","id":"1811.07871","version":1},"attestation_state":"computed","paper":{"title":"Scalable agent alignment via reward modeling: a research direction","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI","cs.NE","stat.ML"],"primary_cat":"cs.LG","authors_text":"David Krueger, Jan Leike, Miljan Martic, Shane Legg, Tom Everitt, Vishal Maini","submitted_at":"2018-11-19T18:48:04Z","abstract_excerpt":"One obstacle to applying reinforcement learning algorithms to real-world problems is the lack of suitable reward functions. Designing such reward functions is difficult in part because the user only has an implicit understanding of the task objective. This gives rise to the agent alignment problem: how do we create agents that behave in accordance with the user's intentions? We outline a high-level research direction to solve the agent alignment problem centered around reward modeling: learning a reward function from interaction with the user and optimizing the learned reward function with rei"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1811.07871","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2018-11-19T18:48:04Z","cross_cats_sorted":["cs.AI","cs.NE","stat.ML"],"title_canon_sha256":"0e981f58147c8ecca45d1070525bd7b968b1695eb3fd0d49ccd5a4c6d5de8f7b","abstract_canon_sha256":"99e364a7b12594cea686a095c8b91340f1cdb3ba51f02d31a702967d9c98cb93"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T00:00:23.247341Z","signature_b64":"Q4BTU0k028VNl6oxRJdhoO/K7Uq3sEjHEsQ5URwIN1Nu8Sd5TWsGaGBCxz0AZsG3+ttheVh0l6k5iOEwYBGxBg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"8813389b25fb6e3dfa0978d821a8e070ff09b23c11ea0cd2adf5ba406a352ec6","last_reissued_at":"2026-05-18T00:00:23.246793Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T00:00:23.246793Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Scalable agent alignment via reward modeling: a research direction","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI","cs.NE","stat.ML"],"primary_cat":"cs.LG","authors_text":"David Krueger, Jan Leike, Miljan Martic, Shane Legg, Tom Everitt, Vishal Maini","submitted_at":"2018-11-19T18:48:04Z","abstract_excerpt":"One obstacle to applying reinforcement learning algorithms to real-world problems is the lack of suitable reward functions. Designing such reward functions is difficult in part because the user only has an implicit understanding of the task objective. This gives rise to the agent alignment problem: how do we create agents that behave in accordance with the user's intentions? We outline a high-level research direction to solve the agent alignment problem centered around reward modeling: learning a reward function from interaction with the user and optimizing the learned reward function with rei"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1811.07871","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1811.07871","created_at":"2026-05-18T00:00:23.246867+00:00"},{"alias_kind":"arxiv_version","alias_value":"1811.07871v1","created_at":"2026-05-18T00:00:23.246867+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1811.07871","created_at":"2026-05-18T00:00:23.246867+00:00"},{"alias_kind":"pith_short_12","alias_value":"RAJTRGZF7NXD","created_at":"2026-05-18T12:32:50.500415+00:00"},{"alias_kind":"pith_short_16","alias_value":"RAJTRGZF7NXD36QJ","created_at":"2026-05-18T12:32:50.500415+00:00"},{"alias_kind":"pith_short_8","alias_value":"RAJTRGZF","created_at":"2026-05-18T12:32:50.500415+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":30,"internal_anchor_count":19,"sample":[{"citing_arxiv_id":"1906.08663","citing_title":"Modeling AGI Safety Frameworks with Causal Influence Diagrams","ref_index":20,"is_internal_anchor":true},{"citing_arxiv_id":"1906.10918","citing_title":"Towards Empathic Deep Q-Learning","ref_index":7,"is_internal_anchor":true},{"citing_arxiv_id":"1907.00430","citing_title":"Requisite Variety in Ethical Utility Functions for AI Value Alignment","ref_index":23,"is_internal_anchor":true},{"citing_arxiv_id":"2307.15043","citing_title":"Universal and Transferable Adversarial Attacks on Aligned Language Models","ref_index":12,"is_internal_anchor":true},{"citing_arxiv_id":"2504.12501","citing_title":"Reinforcement Learning from Human Feedback","ref_index":38,"is_internal_anchor":true},{"citing_arxiv_id":"2404.13076","citing_title":"LLM Evaluators Recognize and Favor Their Own Generations","ref_index":13,"is_internal_anchor":true},{"citing_arxiv_id":"2605.14588","citing_title":"Silent Collapse in Recursive Learning Systems","ref_index":12,"is_internal_anchor":true},{"citing_arxiv_id":"2605.15300","citing_title":"Deep Pre-Alignment for VLMs","ref_index":53,"is_internal_anchor":true},{"citing_arxiv_id":"2210.10760","citing_title":"Scaling Laws for Reward Model Overoptimization","ref_index":18,"is_internal_anchor":true},{"citing_arxiv_id":"2508.06412","citing_title":"Sample-efficient LLM Optimization with Reset Replay","ref_index":12,"is_internal_anchor":true},{"citing_arxiv_id":"2408.00724","citing_title":"Inference Scaling Laws: An Empirical Analysis of Compute-Optimal Inference for Problem-Solving with Language Models","ref_index":278,"is_internal_anchor":true},{"citing_arxiv_id":"2009.01325","citing_title":"Learning to summarize from human feedback","ref_index":35,"is_internal_anchor":true},{"citing_arxiv_id":"2304.06767","citing_title":"RAFT: Reward rAnked FineTuning for Generative Foundation Model Alignment","ref_index":114,"is_internal_anchor":true},{"citing_arxiv_id":"2211.03540","citing_title":"Measuring Progress on Scalable Oversight for Large Language Models","ref_index":52,"is_internal_anchor":true},{"citing_arxiv_id":"2402.05070","citing_title":"A Roadmap to Pluralistic Alignment","ref_index":271,"is_internal_anchor":true},{"citing_arxiv_id":"2212.03827","citing_title":"Discovering Latent Knowledge in Language Models Without Supervision","ref_index":17,"is_internal_anchor":true},{"citing_arxiv_id":"1906.01820","citing_title":"Risks from Learned Optimization in Advanced Machine Learning Systems","ref_index":8,"is_internal_anchor":true},{"citing_arxiv_id":"2603.18633","citing_title":"An Onto-Relational-Sophic Framework for Governing Synthetic Minds","ref_index":28,"is_internal_anchor":true},{"citing_arxiv_id":"2605.12809","citing_title":"Correcting Influence: Unboxing LLM Outputs with Orthogonal Latent Spaces","ref_index":222,"is_internal_anchor":true},{"citing_arxiv_id":"2604.24155","citing_title":"The Alignment Target Problem: Divergent Moral Judgments of Humans, AI Systems, and Their Designers","ref_index":32,"is_internal_anchor":false},{"citing_arxiv_id":"2605.03512","citing_title":"Brainrot: Deskilling and Addiction are Overlooked AI Risks","ref_index":59,"is_internal_anchor":false},{"citing_arxiv_id":"2604.26360","citing_title":"Uncertainty-Aware Reward Discounting for Mitigating Reward Hacking","ref_index":13,"is_internal_anchor":false},{"citing_arxiv_id":"2605.01643","citing_title":"AI Alignment via Incentives and Correction","ref_index":35,"is_internal_anchor":false},{"citing_arxiv_id":"2604.24155","citing_title":"The Alignment Target Problem: Divergent Moral Judgments of Humans, AI Systems, and Their Designers","ref_index":32,"is_internal_anchor":false},{"citing_arxiv_id":"2502.01456","citing_title":"Process Reinforcement through Implicit Rewards","ref_index":69,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/RAJTRGZF7NXD36QJPDMCDKHAOD","json":"https://pith.science/pith/RAJTRGZF7NXD36QJPDMCDKHAOD.json","graph_json":"https://pith.science/api/pith-number/RAJTRGZF7NXD36QJPDMCDKHAOD/graph.json","events_json":"https://pith.science/api/pith-number/RAJTRGZF7NXD36QJPDMCDKHAOD/events.json","paper":"https://pith.science/paper/RAJTRGZF"},"agent_actions":{"view_html":"https://pith.science/pith/RAJTRGZF7NXD36QJPDMCDKHAOD","download_json":"https://pith.science/pith/RAJTRGZF7NXD36QJPDMCDKHAOD.json","view_paper":"https://pith.science/paper/RAJTRGZF","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1811.07871&json=true","fetch_graph":"https://pith.science/api/pith-number/RAJTRGZF7NXD36QJPDMCDKHAOD/graph.json","fetch_events":"https://pith.science/api/pith-number/RAJTRGZF7NXD36QJPDMCDKHAOD/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/RAJTRGZF7NXD36QJPDMCDKHAOD/action/timestamp_anchor","attest_storage":"https://pith.science/pith/RAJTRGZF7NXD36QJPDMCDKHAOD/action/storage_attestation","attest_author":"https://pith.science/pith/RAJTRGZF7NXD36QJPDMCDKHAOD/action/author_attestation","sign_citation":"https://pith.science/pith/RAJTRGZF7NXD36QJPDMCDKHAOD/action/citation_signature","submit_replication":"https://pith.science/pith/RAJTRGZF7NXD36QJPDMCDKHAOD/action/replication_record"}},"created_at":"2026-05-18T00:00:23.246867+00:00","updated_at":"2026-05-18T00:00:23.246867+00:00"}