{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:UPBSTUQS7QUCKPYSBPES3K3SJU","short_pith_number":"pith:UPBSTUQS","canonical_record":{"source":{"id":"2601.22664","kind":"arxiv","version":4},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2026-01-30T07:32:35Z","cross_cats_sorted":[],"title_canon_sha256":"b76e9a8246bfc8df4ef9fc0ac6d38a3c7d7899eccbf75a74e41acc736fe158d8","abstract_canon_sha256":"01704cee8d62cf3c70ecf5cc1040d7603c9bdbde48118aadff54a4a0a9f68bc6"},"schema_version":"1.0"},"canonical_sha256":"a3c329d212fc28253f120bc92dab724d1bf4dad53cdb2224674ab2cd19ff4149","source":{"kind":"arxiv","id":"2601.22664","version":4},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2601.22664","created_at":"2026-05-20T00:03:03Z"},{"alias_kind":"arxiv_version","alias_value":"2601.22664v4","created_at":"2026-05-20T00:03:03Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2601.22664","created_at":"2026-05-20T00:03:03Z"},{"alias_kind":"pith_short_12","alias_value":"UPBSTUQS7QUC","created_at":"2026-05-20T00:03:03Z"},{"alias_kind":"pith_short_16","alias_value":"UPBSTUQS7QUCKPYS","created_at":"2026-05-20T00:03:03Z"},{"alias_kind":"pith_short_8","alias_value":"UPBSTUQS","created_at":"2026-05-20T00:03:03Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:UPBSTUQS7QUCKPYSBPES3K3SJU","target":"record","payload":{"canonical_record":{"source":{"id":"2601.22664","kind":"arxiv","version":4},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2026-01-30T07:32:35Z","cross_cats_sorted":[],"title_canon_sha256":"b76e9a8246bfc8df4ef9fc0ac6d38a3c7d7899eccbf75a74e41acc736fe158d8","abstract_canon_sha256":"01704cee8d62cf3c70ecf5cc1040d7603c9bdbde48118aadff54a4a0a9f68bc6"},"schema_version":"1.0"},"canonical_sha256":"a3c329d212fc28253f120bc92dab724d1bf4dad53cdb2224674ab2cd19ff4149","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T00:03:03.859085Z","signature_b64":"/dNzsoOj5utUcB0cHNJgPrmgAQ6bBOHH/EGqsX1Ao2oREn9Xbg0L7I3Cz05Dvc+z5mHHRZ4snIkw98dZ5+Z0AQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"a3c329d212fc28253f120bc92dab724d1bf4dad53cdb2224674ab2cd19ff4149","last_reissued_at":"2026-05-20T00:03:03.858109Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T00:03:03.858109Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2601.22664","source_version":4,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-20T00:03:03Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"n+0gc/wooWgAu6swvh7XOEj5nBkgs70iY8jlCoIY1HMN9mEMPA1iPdLBQV7/4bWMf3RDPKa2DQ8oXxMyZc0+BA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-30T04:14:41.598985Z"},"content_sha256":"3c58b5c22119fee5050b2d3555e3cfc28818a88ad63334f2adfefbf6611a7fca","schema_version":"1.0","event_id":"sha256:3c58b5c22119fee5050b2d3555e3cfc28818a88ad63334f2adfefbf6611a7fca"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:UPBSTUQS7QUCKPYSBPES3K3SJU","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Real-Time Aligned Reward Model beyond Semantics","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.AI","authors_text":"Deqing Wang, Fuzhen Zhuang, Hongyan Xie, Jianbin Zheng, Jianxin Li, Li Huaqiu, Songshi Liang, Xin Xia, Xuefeng Xiao, Yikun Ban, Yuxi Ren, Zhongxiang Dai, Zixuan Huang","submitted_at":"2026-01-30T07:32:35Z","abstract_excerpt":"Reinforcement Learning from Human Feedback (RLHF) is a pivotal technique for aligning large language models (LLMs) with human preferences, yet it is susceptible to reward overoptimization, in which policy models overfit to the reward model, exploit spurious reward patterns instead of faithfully capturing human intent. Prior mitigations primarily relies on surface semantic information and fails to efficiently address the misalignment between the reward model (RM) and the policy model caused by continuous policy distribution shifts. This inevitably leads to an increasing reward discrepancy, exac"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2601.22664","kind":"arxiv","version":4},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2601.22664/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-20T00:03:03Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"2DEXktVPw6i5JunPmU6kBRDi0AZvvahWy5IYaz3k7fHpSGj7tsedh7Dsrh7ib1unPOlXZDwcyj3Q1UXRYqmzCw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-30T04:14:41.599661Z"},"content_sha256":"3fd8820d0b3e468465d0f01fc9ce57ece9fac8712617ba17c2dc6473537f4776","schema_version":"1.0","event_id":"sha256:3fd8820d0b3e468465d0f01fc9ce57ece9fac8712617ba17c2dc6473537f4776"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/UPBSTUQS7QUCKPYSBPES3K3SJU/bundle.json","state_url":"https://pith.science/pith/UPBSTUQS7QUCKPYSBPES3K3SJU/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/UPBSTUQS7QUCKPYSBPES3K3SJU/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-30T04:14:41Z","links":{"resolver":"https://pith.science/pith/UPBSTUQS7QUCKPYSBPES3K3SJU","bundle":"https://pith.science/pith/UPBSTUQS7QUCKPYSBPES3K3SJU/bundle.json","state":"https://pith.science/pith/UPBSTUQS7QUCKPYSBPES3K3SJU/state.json","well_known_bundle":"https://pith.science/.well-known/pith/UPBSTUQS7QUCKPYSBPES3K3SJU/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:UPBSTUQS7QUCKPYSBPES3K3SJU","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"01704cee8d62cf3c70ecf5cc1040d7603c9bdbde48118aadff54a4a0a9f68bc6","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2026-01-30T07:32:35Z","title_canon_sha256":"b76e9a8246bfc8df4ef9fc0ac6d38a3c7d7899eccbf75a74e41acc736fe158d8"},"schema_version":"1.0","source":{"id":"2601.22664","kind":"arxiv","version":4}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2601.22664","created_at":"2026-05-20T00:03:03Z"},{"alias_kind":"arxiv_version","alias_value":"2601.22664v4","created_at":"2026-05-20T00:03:03Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2601.22664","created_at":"2026-05-20T00:03:03Z"},{"alias_kind":"pith_short_12","alias_value":"UPBSTUQS7QUC","created_at":"2026-05-20T00:03:03Z"},{"alias_kind":"pith_short_16","alias_value":"UPBSTUQS7QUCKPYS","created_at":"2026-05-20T00:03:03Z"},{"alias_kind":"pith_short_8","alias_value":"UPBSTUQS","created_at":"2026-05-20T00:03:03Z"}],"graph_snapshots":[{"event_id":"sha256:3fd8820d0b3e468465d0f01fc9ce57ece9fac8712617ba17c2dc6473537f4776","target":"graph","created_at":"2026-05-20T00:03:03Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2601.22664/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Reinforcement Learning from Human Feedback (RLHF) is a pivotal technique for aligning large language models (LLMs) with human preferences, yet it is susceptible to reward overoptimization, in which policy models overfit to the reward model, exploit spurious reward patterns instead of faithfully capturing human intent. Prior mitigations primarily relies on surface semantic information and fails to efficiently address the misalignment between the reward model (RM) and the policy model caused by continuous policy distribution shifts. This inevitably leads to an increasing reward discrepancy, exac","authors_text":"Deqing Wang, Fuzhen Zhuang, Hongyan Xie, Jianbin Zheng, Jianxin Li, Li Huaqiu, Songshi Liang, Xin Xia, Xuefeng Xiao, Yikun Ban, Yuxi Ren, Zhongxiang Dai, Zixuan Huang","cross_cats":[],"headline":"","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2026-01-30T07:32:35Z","title":"Real-Time Aligned Reward Model beyond Semantics"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2601.22664","kind":"arxiv","version":4},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:3c58b5c22119fee5050b2d3555e3cfc28818a88ad63334f2adfefbf6611a7fca","target":"record","created_at":"2026-05-20T00:03:03Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"01704cee8d62cf3c70ecf5cc1040d7603c9bdbde48118aadff54a4a0a9f68bc6","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2026-01-30T07:32:35Z","title_canon_sha256":"b76e9a8246bfc8df4ef9fc0ac6d38a3c7d7899eccbf75a74e41acc736fe158d8"},"schema_version":"1.0","source":{"id":"2601.22664","kind":"arxiv","version":4}},"canonical_sha256":"a3c329d212fc28253f120bc92dab724d1bf4dad53cdb2224674ab2cd19ff4149","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"a3c329d212fc28253f120bc92dab724d1bf4dad53cdb2224674ab2cd19ff4149","first_computed_at":"2026-05-20T00:03:03.858109Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-20T00:03:03.858109Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"/dNzsoOj5utUcB0cHNJgPrmgAQ6bBOHH/EGqsX1Ao2oREn9Xbg0L7I3Cz05Dvc+z5mHHRZ4snIkw98dZ5+Z0AQ==","signature_status":"signed_v1","signed_at":"2026-05-20T00:03:03.859085Z","signed_message":"canonical_sha256_bytes"},"source_id":"2601.22664","source_kind":"arxiv","source_version":4}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:3c58b5c22119fee5050b2d3555e3cfc28818a88ad63334f2adfefbf6611a7fca","sha256:3fd8820d0b3e468465d0f01fc9ce57ece9fac8712617ba17c2dc6473537f4776"],"state_sha256":"dbe556aa2ac33ba02b02a1f9a9f8ccc45c4cd695bf06fc781e51bfb3a3e2220d"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"oeBGt97OReHp8LSGpRroJIWnOGu+b8jE4JyhqALltMQrtwR3204A45tlunNYtonY0cZrWY49UDynUnHg/OnLAw==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-30T04:14:41.602699Z","bundle_sha256":"7ec847e20632a638c688057fed6202f7186b434bef6647c23f9bbdae0e6efa99"}}