{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:BF74DONPSGUCOZO7LS72TNI6K7","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"55c3bf34555b8875db5f726bc5145bffc2d568879754103ab651d21ba35d2ae5","cross_cats_sorted":["cs.LG"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2026-06-01T02:06:58Z","title_canon_sha256":"2a0f349417c4302181136015e7509da38419bd38636d3fab59ffbbfec4f82f71"},"schema_version":"1.0","source":{"id":"2606.01561","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2606.01561","created_at":"2026-06-02T02:04:36Z"},{"alias_kind":"arxiv_version","alias_value":"2606.01561v1","created_at":"2026-06-02T02:04:36Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.01561","created_at":"2026-06-02T02:04:36Z"},{"alias_kind":"pith_short_12","alias_value":"BF74DONPSGUC","created_at":"2026-06-02T02:04:36Z"},{"alias_kind":"pith_short_16","alias_value":"BF74DONPSGUCOZO7","created_at":"2026-06-02T02:04:36Z"},{"alias_kind":"pith_short_8","alias_value":"BF74DONP","created_at":"2026-06-02T02:04:36Z"}],"graph_snapshots":[{"event_id":"sha256:95a373dc5540e501dd11b35ee3910994e175e5f94bfde812c5997ad920962257","target":"graph","created_at":"2026-06-02T02:04:36Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2606.01561/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Aligning Large Language Models (LLMs) with human preferences is often formulated via Direct Preference Optimization (DPO). However, the standard Bradley-Terry instantiation of DPO is limited in modeling common departures from transitivity in human preferences. To address this, recent work has introduced Self-Play Preference Optimization (SPPO), which iteratively refines the policy by training on self-generated win-lose pairs. Our investigation, however, reveals a critical instability in SPPO: the optimization is prone to policy degeneration when the preference oracle assigns overly confident w","authors_text":"Abolfazl Razi, Anderson Schneider, Brendan Hogan Rappazzo, Feng Luo, Huayu Li, Jingjing Wang, Mingkun Xu, Peijie Qiu, Prayag Tiwari, Wenhui Zhu, Xiwen Chen, Xuanzhao Dong, Yujian Xiong, Yuriy Nevmyvaka, ZhengXiao He, Zhipeng Wang","cross_cats":["cs.LG"],"headline":"","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2026-06-01T02:06:58Z","title":"S-SPPO: Semantic-Calibrated Self-Play Preference Optimization"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.01561","kind":"arxiv","version":1},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:484b240dbc03976811ae21acac10303cb1c41d08ac19d4afcb5367f4a01001ee","target":"record","created_at":"2026-06-02T02:04:36Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"55c3bf34555b8875db5f726bc5145bffc2d568879754103ab651d21ba35d2ae5","cross_cats_sorted":["cs.LG"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2026-06-01T02:06:58Z","title_canon_sha256":"2a0f349417c4302181136015e7509da38419bd38636d3fab59ffbbfec4f82f71"},"schema_version":"1.0","source":{"id":"2606.01561","kind":"arxiv","version":1}},"canonical_sha256":"097fc1b9af91a82765df5cbfa9b51e57def2a355acd802d11bce5c907e77264d","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"097fc1b9af91a82765df5cbfa9b51e57def2a355acd802d11bce5c907e77264d","first_computed_at":"2026-06-02T02:04:36.640958Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-06-02T02:04:36.640958Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"QJBfd9M1HbeK4vsytINuCgxOCPp4hFBa7sZNBtHev54EK4w6v6vdBDuXUufrLvWIo72BCpuyhpMKlUBR6DNZDw==","signature_status":"signed_v1","signed_at":"2026-06-02T02:04:36.641396Z","signed_message":"canonical_sha256_bytes"},"source_id":"2606.01561","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:484b240dbc03976811ae21acac10303cb1c41d08ac19d4afcb5367f4a01001ee","sha256:95a373dc5540e501dd11b35ee3910994e175e5f94bfde812c5997ad920962257"],"state_sha256":"2e6ded5a0b6113d0f8e2b882954230c4c64d8c118e3061bf3a5eeb0252cfcd7a"}