{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:CIVC5APGZ5JW37FC2DUAY5F5RU","short_pith_number":"pith:CIVC5APG","canonical_record":{"source":{"id":"2605.27355","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2026-05-26T17:57:04Z","cross_cats_sorted":["cs.CL","cs.LG"],"title_canon_sha256":"56b9f89b9f1235b5778dbc465ed416de7871f15bc136d27a70362336af018935","abstract_canon_sha256":"4f5762fb4eaaeaf4bc4cc86649356cb239d41759085752e0bc080a48842dc824"},"schema_version":"1.0"},"canonical_sha256":"122a2e81e6cf536dfca2d0e80c74bd8d27b5a4ff94a956551df2d4da3180c668","source":{"kind":"arxiv","id":"2605.27355","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.27355","created_at":"2026-05-27T02:06:19Z"},{"alias_kind":"arxiv_version","alias_value":"2605.27355v1","created_at":"2026-05-27T02:06:19Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.27355","created_at":"2026-05-27T02:06:19Z"},{"alias_kind":"pith_short_12","alias_value":"CIVC5APGZ5JW","created_at":"2026-05-27T02:06:19Z"},{"alias_kind":"pith_short_16","alias_value":"CIVC5APGZ5JW37FC","created_at":"2026-05-27T02:06:19Z"},{"alias_kind":"pith_short_8","alias_value":"CIVC5APG","created_at":"2026-05-27T02:06:19Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:CIVC5APGZ5JW37FC2DUAY5F5RU","target":"record","payload":{"canonical_record":{"source":{"id":"2605.27355","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2026-05-26T17:57:04Z","cross_cats_sorted":["cs.CL","cs.LG"],"title_canon_sha256":"56b9f89b9f1235b5778dbc465ed416de7871f15bc136d27a70362336af018935","abstract_canon_sha256":"4f5762fb4eaaeaf4bc4cc86649356cb239d41759085752e0bc080a48842dc824"},"schema_version":"1.0"},"canonical_sha256":"122a2e81e6cf536dfca2d0e80c74bd8d27b5a4ff94a956551df2d4da3180c668","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-27T02:06:19.498290Z","signature_b64":"FMjiqhiMph8ffr8XRLCYAsOpNPv4e41OK4vPetMaLFejzoGYDic50rEjVcF4T41zj6aGpYCvYJx0noIT6nj8Bw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"122a2e81e6cf536dfca2d0e80c74bd8d27b5a4ff94a956551df2d4da3180c668","last_reissued_at":"2026-05-27T02:06:19.497326Z","signature_status":"signed_v1","first_computed_at":"2026-05-27T02:06:19.497326Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2605.27355","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-27T02:06:19Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"ug/Gj9LuwT3m/JXwiGwDZuOTWESB/6LjrV7qoIhX1Yc/YOHupONcJcwHlCRuRK6SrpNO5WqJRuAKnaFMkpK1CA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-09T07:29:33.324234Z"},"content_sha256":"9f2887cfab5d0264d1dd938f7eaa2b05e0f6a52915dd795b3f69d51dcc3055af","schema_version":"1.0","event_id":"sha256:9f2887cfab5d0264d1dd938f7eaa2b05e0f6a52915dd795b3f69d51dcc3055af"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:CIVC5APGZ5JW37FC2DUAY5F5RU","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Alignment Tampering: How Reinforcement Learning from Human Feedback Is Exploited to Optimize Misaligned Biases","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.CL","cs.LG"],"primary_cat":"cs.AI","authors_text":"Dongyoon Hahm, Dylan Hadfield-Menell, Kimin Lee","submitted_at":"2026-05-26T17:57:04Z","abstract_excerpt":"Reinforcement Learning from Human Feedback (RLHF) is the standard method to align Large Language Models (LLMs) with human preferences. In this work, we introduce alignment tampering, a potential vulnerability where the LLM undergoing alignment influences the preference dataset, causing RLHF to amplify undesired behaviors. This arises from core limitations of RLHF: (1) preference datasets are constructed from the LLM's own outputs, allowing it to influence them, and (2) pairwise comparisons only indicate which response is better, not why. These limitations can be exploited to cause alignment ta"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.27355","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.27355/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-27T02:06:19Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"0IJ2j96Tm16VB+jyEkecQ1+7uClsjVW1iZAYPqDuHdUwe4qm7lM4IqJOvH15bNKNn2WrZP6yY2Wkfk0Ho79OBA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-09T07:29:33.325038Z"},"content_sha256":"79f5460f59f0a755c1ade650332c044e45c7e308646c646bac7d7829de64d0de","schema_version":"1.0","event_id":"sha256:79f5460f59f0a755c1ade650332c044e45c7e308646c646bac7d7829de64d0de"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/CIVC5APGZ5JW37FC2DUAY5F5RU/bundle.json","state_url":"https://pith.science/pith/CIVC5APGZ5JW37FC2DUAY5F5RU/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/CIVC5APGZ5JW37FC2DUAY5F5RU/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-09T07:29:33Z","links":{"resolver":"https://pith.science/pith/CIVC5APGZ5JW37FC2DUAY5F5RU","bundle":"https://pith.science/pith/CIVC5APGZ5JW37FC2DUAY5F5RU/bundle.json","state":"https://pith.science/pith/CIVC5APGZ5JW37FC2DUAY5F5RU/state.json","well_known_bundle":"https://pith.science/.well-known/pith/CIVC5APGZ5JW37FC2DUAY5F5RU/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:CIVC5APGZ5JW37FC2DUAY5F5RU","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"4f5762fb4eaaeaf4bc4cc86649356cb239d41759085752e0bc080a48842dc824","cross_cats_sorted":["cs.CL","cs.LG"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2026-05-26T17:57:04Z","title_canon_sha256":"56b9f89b9f1235b5778dbc465ed416de7871f15bc136d27a70362336af018935"},"schema_version":"1.0","source":{"id":"2605.27355","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.27355","created_at":"2026-05-27T02:06:19Z"},{"alias_kind":"arxiv_version","alias_value":"2605.27355v1","created_at":"2026-05-27T02:06:19Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.27355","created_at":"2026-05-27T02:06:19Z"},{"alias_kind":"pith_short_12","alias_value":"CIVC5APGZ5JW","created_at":"2026-05-27T02:06:19Z"},{"alias_kind":"pith_short_16","alias_value":"CIVC5APGZ5JW37FC","created_at":"2026-05-27T02:06:19Z"},{"alias_kind":"pith_short_8","alias_value":"CIVC5APG","created_at":"2026-05-27T02:06:19Z"}],"graph_snapshots":[{"event_id":"sha256:79f5460f59f0a755c1ade650332c044e45c7e308646c646bac7d7829de64d0de","target":"graph","created_at":"2026-05-27T02:06:19Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2605.27355/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Reinforcement Learning from Human Feedback (RLHF) is the standard method to align Large Language Models (LLMs) with human preferences. In this work, we introduce alignment tampering, a potential vulnerability where the LLM undergoing alignment influences the preference dataset, causing RLHF to amplify undesired behaviors. This arises from core limitations of RLHF: (1) preference datasets are constructed from the LLM's own outputs, allowing it to influence them, and (2) pairwise comparisons only indicate which response is better, not why. These limitations can be exploited to cause alignment ta","authors_text":"Dongyoon Hahm, Dylan Hadfield-Menell, Kimin Lee","cross_cats":["cs.CL","cs.LG"],"headline":"","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2026-05-26T17:57:04Z","title":"Alignment Tampering: How Reinforcement Learning from Human Feedback Is Exploited to Optimize Misaligned Biases"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.27355","kind":"arxiv","version":1},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:9f2887cfab5d0264d1dd938f7eaa2b05e0f6a52915dd795b3f69d51dcc3055af","target":"record","created_at":"2026-05-27T02:06:19Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"4f5762fb4eaaeaf4bc4cc86649356cb239d41759085752e0bc080a48842dc824","cross_cats_sorted":["cs.CL","cs.LG"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2026-05-26T17:57:04Z","title_canon_sha256":"56b9f89b9f1235b5778dbc465ed416de7871f15bc136d27a70362336af018935"},"schema_version":"1.0","source":{"id":"2605.27355","kind":"arxiv","version":1}},"canonical_sha256":"122a2e81e6cf536dfca2d0e80c74bd8d27b5a4ff94a956551df2d4da3180c668","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"122a2e81e6cf536dfca2d0e80c74bd8d27b5a4ff94a956551df2d4da3180c668","first_computed_at":"2026-05-27T02:06:19.497326Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-27T02:06:19.497326Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"FMjiqhiMph8ffr8XRLCYAsOpNPv4e41OK4vPetMaLFejzoGYDic50rEjVcF4T41zj6aGpYCvYJx0noIT6nj8Bw==","signature_status":"signed_v1","signed_at":"2026-05-27T02:06:19.498290Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.27355","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:9f2887cfab5d0264d1dd938f7eaa2b05e0f6a52915dd795b3f69d51dcc3055af","sha256:79f5460f59f0a755c1ade650332c044e45c7e308646c646bac7d7829de64d0de"],"state_sha256":"91b9d81510fcdd3d9103ecc75bb04a2999ab3c230440932e89eee02a0783d015"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"9ABDlXjMw8kHj+43UkIhecjXrnXjXPhZ6J+Ec3+iZZHSlj7Qd+7nnbfashI12soswLYvFDQj2JjuohO+1H5MDg==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-09T07:29:33.328959Z","bundle_sha256":"952c21386744932b06f7bcef770e91b43c4759f6a7c8d8e23a4a2a4efddd9c4b"}}