{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:VCVBSE4H6XPFVBCOWYCCH2QHWC","short_pith_number":"pith:VCVBSE4H","canonical_record":{"source":{"id":"2602.12124","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2026-02-12T16:13:14Z","cross_cats_sorted":["cs.CL"],"title_canon_sha256":"7a91dc0f39ba595c81a38e52f34f8b419a11ddade4c9069b8da3597473cf1fd2","abstract_canon_sha256":"b2c69ef8a07714f021bb55f1621ed06d5f696738863c5ff8e85f32df2103ff14"},"schema_version":"1.0"},"canonical_sha256":"a8aa191387f5de5a844eb60423ea07b082ae46d44a6ae5cc22be3012de181cba","source":{"kind":"arxiv","id":"2602.12124","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2602.12124","created_at":"2026-06-05T01:15:20Z"},{"alias_kind":"arxiv_version","alias_value":"2602.12124v2","created_at":"2026-06-05T01:15:20Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2602.12124","created_at":"2026-06-05T01:15:20Z"},{"alias_kind":"pith_short_12","alias_value":"VCVBSE4H6XPF","created_at":"2026-06-05T01:15:20Z"},{"alias_kind":"pith_short_16","alias_value":"VCVBSE4H6XPFVBCO","created_at":"2026-06-05T01:15:20Z"},{"alias_kind":"pith_short_8","alias_value":"VCVBSE4H","created_at":"2026-06-05T01:15:20Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:VCVBSE4H6XPFVBCOWYCCH2QHWC","target":"record","payload":{"canonical_record":{"source":{"id":"2602.12124","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2026-02-12T16:13:14Z","cross_cats_sorted":["cs.CL"],"title_canon_sha256":"7a91dc0f39ba595c81a38e52f34f8b419a11ddade4c9069b8da3597473cf1fd2","abstract_canon_sha256":"b2c69ef8a07714f021bb55f1621ed06d5f696738863c5ff8e85f32df2103ff14"},"schema_version":"1.0"},"canonical_sha256":"a8aa191387f5de5a844eb60423ea07b082ae46d44a6ae5cc22be3012de181cba","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-05T01:15:20.153292Z","signature_b64":"/Q0aywJlPQvHsf6uk8SsjE68BhBaoOjs8UwjVtUJUP43/jAS6LAOwP8Mo3ghb0FIPrmGxQZQrNxgNyRaFskmAw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"a8aa191387f5de5a844eb60423ea07b082ae46d44a6ae5cc22be3012de181cba","last_reissued_at":"2026-06-05T01:15:20.152776Z","signature_status":"signed_v1","first_computed_at":"2026-06-05T01:15:20.152776Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2602.12124","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-06-05T01:15:20Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"W6qyJnwQI1rDn73WDIXvB6492E4kzp9zUSKy5fqZGI5CkIfgFn40Y2ZtiPKA+VfzOu45u/umNAod2CVYSkpBCA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-09T21:16:49.339978Z"},"content_sha256":"3b454f32153310babb929c045e3078d4d0c381cf99ac5ef5e126610320a34774","schema_version":"1.0","event_id":"sha256:3b454f32153310babb929c045e3078d4d0c381cf99ac5ef5e126610320a34774"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:VCVBSE4H6XPFVBCOWYCCH2QHWC","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Alignment Risks from Capability-Seeking RL Training","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.CL"],"primary_cat":"cs.LG","authors_text":"Han Bao, Kehan Guo, Nitesh V Chawla, Nuno Moniz, Pin-Yu Chen, Tian Gao, Werner Geyer, Xiangliang Zhang, Yue Huang, Yujun Zhou, Zhenwen Liang","submitted_at":"2026-02-12T16:13:14Z","abstract_excerpt":"While most AI alignment research focuses on preventing models from generating explicitly harmful content, a more subtle risk arises from capability-seeking RL training in vulnerable environments. We investigate whether language models, when trained with reinforcement learning (RL) in environments with implicit loopholes, can learn to exploit these flaws to maximize reward, even without being explicitly instructed to do so. To test this, we design a suite of four diverse \"vulnerability games,\" each presenting a structural vulnerability related to context-conditional compliance, proxy metrics, r"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2602.12124","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2602.12124/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-06-05T01:15:20Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"BVD+vmLQD97GBLIhDGoXCL+VQkaDG8Eg9xntecBz4DZNTMg+O/VGL1+VMgQPzwD/5Q6ewoIT85y4F1a/6DOYCA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-09T21:16:49.340757Z"},"content_sha256":"3d71708576c4005abee799dee9b7be193e38aa031e2124b7f81c7ea6d51c3d3c","schema_version":"1.0","event_id":"sha256:3d71708576c4005abee799dee9b7be193e38aa031e2124b7f81c7ea6d51c3d3c"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/VCVBSE4H6XPFVBCOWYCCH2QHWC/bundle.json","state_url":"https://pith.science/pith/VCVBSE4H6XPFVBCOWYCCH2QHWC/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/VCVBSE4H6XPFVBCOWYCCH2QHWC/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-09T21:16:49Z","links":{"resolver":"https://pith.science/pith/VCVBSE4H6XPFVBCOWYCCH2QHWC","bundle":"https://pith.science/pith/VCVBSE4H6XPFVBCOWYCCH2QHWC/bundle.json","state":"https://pith.science/pith/VCVBSE4H6XPFVBCOWYCCH2QHWC/state.json","well_known_bundle":"https://pith.science/.well-known/pith/VCVBSE4H6XPFVBCOWYCCH2QHWC/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:VCVBSE4H6XPFVBCOWYCCH2QHWC","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"b2c69ef8a07714f021bb55f1621ed06d5f696738863c5ff8e85f32df2103ff14","cross_cats_sorted":["cs.CL"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2026-02-12T16:13:14Z","title_canon_sha256":"7a91dc0f39ba595c81a38e52f34f8b419a11ddade4c9069b8da3597473cf1fd2"},"schema_version":"1.0","source":{"id":"2602.12124","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2602.12124","created_at":"2026-06-05T01:15:20Z"},{"alias_kind":"arxiv_version","alias_value":"2602.12124v2","created_at":"2026-06-05T01:15:20Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2602.12124","created_at":"2026-06-05T01:15:20Z"},{"alias_kind":"pith_short_12","alias_value":"VCVBSE4H6XPF","created_at":"2026-06-05T01:15:20Z"},{"alias_kind":"pith_short_16","alias_value":"VCVBSE4H6XPFVBCO","created_at":"2026-06-05T01:15:20Z"},{"alias_kind":"pith_short_8","alias_value":"VCVBSE4H","created_at":"2026-06-05T01:15:20Z"}],"graph_snapshots":[{"event_id":"sha256:3d71708576c4005abee799dee9b7be193e38aa031e2124b7f81c7ea6d51c3d3c","target":"graph","created_at":"2026-06-05T01:15:20Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2602.12124/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"While most AI alignment research focuses on preventing models from generating explicitly harmful content, a more subtle risk arises from capability-seeking RL training in vulnerable environments. We investigate whether language models, when trained with reinforcement learning (RL) in environments with implicit loopholes, can learn to exploit these flaws to maximize reward, even without being explicitly instructed to do so. To test this, we design a suite of four diverse \"vulnerability games,\" each presenting a structural vulnerability related to context-conditional compliance, proxy metrics, r","authors_text":"Han Bao, Kehan Guo, Nitesh V Chawla, Nuno Moniz, Pin-Yu Chen, Tian Gao, Werner Geyer, Xiangliang Zhang, Yue Huang, Yujun Zhou, Zhenwen Liang","cross_cats":["cs.CL"],"headline":"","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2026-02-12T16:13:14Z","title":"Alignment Risks from Capability-Seeking RL Training"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2602.12124","kind":"arxiv","version":2},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:3b454f32153310babb929c045e3078d4d0c381cf99ac5ef5e126610320a34774","target":"record","created_at":"2026-06-05T01:15:20Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"b2c69ef8a07714f021bb55f1621ed06d5f696738863c5ff8e85f32df2103ff14","cross_cats_sorted":["cs.CL"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2026-02-12T16:13:14Z","title_canon_sha256":"7a91dc0f39ba595c81a38e52f34f8b419a11ddade4c9069b8da3597473cf1fd2"},"schema_version":"1.0","source":{"id":"2602.12124","kind":"arxiv","version":2}},"canonical_sha256":"a8aa191387f5de5a844eb60423ea07b082ae46d44a6ae5cc22be3012de181cba","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"a8aa191387f5de5a844eb60423ea07b082ae46d44a6ae5cc22be3012de181cba","first_computed_at":"2026-06-05T01:15:20.152776Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-06-05T01:15:20.152776Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"/Q0aywJlPQvHsf6uk8SsjE68BhBaoOjs8UwjVtUJUP43/jAS6LAOwP8Mo3ghb0FIPrmGxQZQrNxgNyRaFskmAw==","signature_status":"signed_v1","signed_at":"2026-06-05T01:15:20.153292Z","signed_message":"canonical_sha256_bytes"},"source_id":"2602.12124","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:3b454f32153310babb929c045e3078d4d0c381cf99ac5ef5e126610320a34774","sha256:3d71708576c4005abee799dee9b7be193e38aa031e2124b7f81c7ea6d51c3d3c"],"state_sha256":"6f56c3b4b71d2c52254126aee3b3df6e2c6ea5c26a866e32a49ee8d9d3d4ed16"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"cJmYKgz5jnic5Tv/KLYfv9bzNUCILrPmWKwFzup8oiGyOzzdrgHOI9Z/Kveuv7+eOQa6piNYySUO4QNOSUeUCg==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-09T21:16:49.344640Z","bundle_sha256":"494fbea2d7fc590394737ae8de4bfe9c121b57dab79c5cfeb430879b0b1147d5"}}