{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2025:O6KD46D5N4AYG3WDJMRUPETDND","short_pith_number":"pith:O6KD46D5","canonical_record":{"source":{"id":"2507.01352","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2025-07-02T04:40:29Z","cross_cats_sorted":["cs.AI","cs.LG"],"title_canon_sha256":"6f033713f4cab0f8de5ba8bf4b556e999b6b3850ba518e493c47fec4a93cd745","abstract_canon_sha256":"eed245eae4f5619efa15a39473bb27c79db95695756a3e819b98bcf16f934774"},"schema_version":"1.0"},"canonical_sha256":"77943e787d6f01836ec34b2347926368c1e8d65c863c19e419cb78384dbde901","source":{"kind":"arxiv","id":"2507.01352","version":3},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2507.01352","created_at":"2026-05-17T23:38:46Z"},{"alias_kind":"arxiv_version","alias_value":"2507.01352v3","created_at":"2026-05-17T23:38:46Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2507.01352","created_at":"2026-05-17T23:38:46Z"},{"alias_kind":"pith_short_12","alias_value":"O6KD46D5N4AY","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"O6KD46D5N4AYG3WD","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"O6KD46D5","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2025:O6KD46D5N4AYG3WDJMRUPETDND","target":"record","payload":{"canonical_record":{"source":{"id":"2507.01352","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2025-07-02T04:40:29Z","cross_cats_sorted":["cs.AI","cs.LG"],"title_canon_sha256":"6f033713f4cab0f8de5ba8bf4b556e999b6b3850ba518e493c47fec4a93cd745","abstract_canon_sha256":"eed245eae4f5619efa15a39473bb27c79db95695756a3e819b98bcf16f934774"},"schema_version":"1.0"},"canonical_sha256":"77943e787d6f01836ec34b2347926368c1e8d65c863c19e419cb78384dbde901","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:46.414238Z","signature_b64":"5FLtFz7X9nonuDgoiFFjgHVkm3Vuig40Fq9VPtamdr3JmY1X003kPK4qHQSvuhkWYs1bf52xlCZdOIqC5UmDAw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"77943e787d6f01836ec34b2347926368c1e8d65c863c19e419cb78384dbde901","last_reissued_at":"2026-05-17T23:38:46.413654Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:46.413654Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2507.01352","source_version":3,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:46Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"7aSrzQ+3+nl3UglKJEkU1eTopzb5yB0CkUsAm8klb22XmfDWYCIs2ThzUnUmtKiBFY1vHb8ZqevzEl6w3s+JCw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-25T23:18:25.644224Z"},"content_sha256":"7856ee217565fae4ed7a0c09ca6bf232cc921931f44d4642df3286978ca7d250","schema_version":"1.0","event_id":"sha256:7856ee217565fae4ed7a0c09ca6bf232cc921931f44d4642df3286978ca7d250"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2025:O6KD46D5N4AYG3WDJMRUPETDND","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Skywork-Reward-V2: Scaling Preference Data Curation via Human-AI Synergy","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Human-AI synergy curates 40 million preference pairs to train state-of-the-art reward models.","cross_cats":["cs.AI","cs.LG"],"primary_cat":"cs.CL","authors_text":"Chaojie Wang, Chris Yuhao Liu, Fuxiang Zhang, Jiacai Liu, Jiacheng Xu, Jujie He, Liang Zeng, Rui Yan, Wei Shen, Yahui Zhou, Yang Liu, Yuzhen Xiao","submitted_at":"2025-07-02T04:40:29Z","abstract_excerpt":"Despite the critical role of reward models (RMs) in Reinforcement Learning from Human Feedback (RLHF), current state-of-the-art open RMs perform poorly on most existing evaluation benchmarks, failing to capture nuanced human preferences. We hypothesize that this brittleness stems primarily from limitations in preference datasets, which are often narrowly scoped, synthetically labeled, or lack rigorous quality control. To address these challenges, we present SynPref-40M, a large-scale preference dataset comprising 40 million preference pairs. To enable data curation at scale, we design a human-"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Skywork-Reward-V2 models achieve state-of-the-art performance across seven major reward model benchmarks, outperform generative reward models, and demonstrate strong downstream performance.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The brittleness of current reward models stems primarily from limitations in preference datasets, and the human-AI synergistic pipeline produces measurably higher-quality data that directly causes the reported benchmark gains.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Skywork-Reward-V2 models trained on 26 million human-AI curated preference pairs set new state-of-the-art results on seven major reward model benchmarks.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Human-AI synergy curates 40 million preference pairs to train state-of-the-art reward models.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"99b168e6256d8384e58df7bf6887432d8156cff65909acd93761ad687a084c27"},"source":{"id":"2507.01352","kind":"arxiv","version":3},"verdict":{"id":"80bb961c-ece1-4f6d-bbb6-d119aab4c401","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-16T22:13:47.494891Z","strongest_claim":"Skywork-Reward-V2 models achieve state-of-the-art performance across seven major reward model benchmarks, outperform generative reward models, and demonstrate strong downstream performance.","one_line_summary":"Skywork-Reward-V2 models trained on 26 million human-AI curated preference pairs set new state-of-the-art results on seven major reward model benchmarks.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The brittleness of current reward models stems primarily from limitations in preference datasets, and the human-AI synergistic pipeline produces measurably higher-quality data that directly causes the reported benchmark gains.","pith_extraction_headline":"Human-AI synergy curates 40 million preference pairs to train state-of-the-art reward models."},"references":{"count":13,"sample":[{"doi":"","year":2023,"title":"Most BT-based models fall under the sequence classifier category, while generative models primarily include LLM-as-a-Judge approaches","work_id":"112ea89c-bff6-4bbb-bfef-157919436666","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"This stratification identifies objective/low-controversial versus subjective/high- controversial regions, where intransitivity is more common","work_id":"843045db-9b8b-47d0-8b30-a624baed982e","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Error-driven adaptive retrieval focuses on “unstable” regions.In Stage 1, we repeatedly train an RM, evaluate it on human-verified gold data, and use error-driven adaptive retrieval to pull in new exa","work_id":"bc2f160c-bde8-43f8-93e1-af92ec24f101","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"Stage 2 dual-RM consistency filtering targets contradictory signals.Stage 2 introduces a consistency filter: we train a gold RM on cumulative human-verified samples and use it together with the Stage-","work_id":"5f8ba03c-8b02-45b9-942b-57e2c763faa3","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Human annotators may not be experts in all types of math and coding problems","work_id":"d72fb68a-3e6b-484f-8884-df2369714391","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":13,"snapshot_sha256":"f3ae2e6a6817d99eb2b25ae3606bec9c875e44c35f925c22882207ae6f7c6400","internal_anchors":0},"formal_canon":{"evidence_count":2,"snapshot_sha256":"f281721ff41d90b9b9cf489b4666903c8c846284d674d1eb3a0ccaeed1e8de96"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"80bb961c-ece1-4f6d-bbb6-d119aab4c401"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:46Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"i7Ze8cjN0g3ZXKkdffEUnS55iGH9GPHcF13YS2IV0L7XhRXGQcfGCsPY3pn77kAkJc+q8C/N4QFwiwQ0UsyMDg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-25T23:18:25.645426Z"},"content_sha256":"7c2f6911bb60d9ef9973497b525bafb6e82586ce00025a095f59ff5ced41850a","schema_version":"1.0","event_id":"sha256:7c2f6911bb60d9ef9973497b525bafb6e82586ce00025a095f59ff5ced41850a"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/O6KD46D5N4AYG3WDJMRUPETDND/bundle.json","state_url":"https://pith.science/pith/O6KD46D5N4AYG3WDJMRUPETDND/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/O6KD46D5N4AYG3WDJMRUPETDND/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-25T23:18:25Z","links":{"resolver":"https://pith.science/pith/O6KD46D5N4AYG3WDJMRUPETDND","bundle":"https://pith.science/pith/O6KD46D5N4AYG3WDJMRUPETDND/bundle.json","state":"https://pith.science/pith/O6KD46D5N4AYG3WDJMRUPETDND/state.json","well_known_bundle":"https://pith.science/.well-known/pith/O6KD46D5N4AYG3WDJMRUPETDND/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2025:O6KD46D5N4AYG3WDJMRUPETDND","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"eed245eae4f5619efa15a39473bb27c79db95695756a3e819b98bcf16f934774","cross_cats_sorted":["cs.AI","cs.LG"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2025-07-02T04:40:29Z","title_canon_sha256":"6f033713f4cab0f8de5ba8bf4b556e999b6b3850ba518e493c47fec4a93cd745"},"schema_version":"1.0","source":{"id":"2507.01352","kind":"arxiv","version":3}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2507.01352","created_at":"2026-05-17T23:38:46Z"},{"alias_kind":"arxiv_version","alias_value":"2507.01352v3","created_at":"2026-05-17T23:38:46Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2507.01352","created_at":"2026-05-17T23:38:46Z"},{"alias_kind":"pith_short_12","alias_value":"O6KD46D5N4AY","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"O6KD46D5N4AYG3WD","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"O6KD46D5","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:7c2f6911bb60d9ef9973497b525bafb6e82586ce00025a095f59ff5ced41850a","target":"graph","created_at":"2026-05-17T23:38:46Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Skywork-Reward-V2 models achieve state-of-the-art performance across seven major reward model benchmarks, outperform generative reward models, and demonstrate strong downstream performance."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"The brittleness of current reward models stems primarily from limitations in preference datasets, and the human-AI synergistic pipeline produces measurably higher-quality data that directly causes the reported benchmark gains."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Skywork-Reward-V2 models trained on 26 million human-AI curated preference pairs set new state-of-the-art results on seven major reward model benchmarks."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Human-AI synergy curates 40 million preference pairs to train state-of-the-art reward models."}],"snapshot_sha256":"99b168e6256d8384e58df7bf6887432d8156cff65909acd93761ad687a084c27"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"f281721ff41d90b9b9cf489b4666903c8c846284d674d1eb3a0ccaeed1e8de96"},"paper":{"abstract_excerpt":"Despite the critical role of reward models (RMs) in Reinforcement Learning from Human Feedback (RLHF), current state-of-the-art open RMs perform poorly on most existing evaluation benchmarks, failing to capture nuanced human preferences. We hypothesize that this brittleness stems primarily from limitations in preference datasets, which are often narrowly scoped, synthetically labeled, or lack rigorous quality control. To address these challenges, we present SynPref-40M, a large-scale preference dataset comprising 40 million preference pairs. To enable data curation at scale, we design a human-","authors_text":"Chaojie Wang, Chris Yuhao Liu, Fuxiang Zhang, Jiacai Liu, Jiacheng Xu, Jujie He, Liang Zeng, Rui Yan, Wei Shen, Yahui Zhou, Yang Liu, Yuzhen Xiao","cross_cats":["cs.AI","cs.LG"],"headline":"Human-AI synergy curates 40 million preference pairs to train state-of-the-art reward models.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2025-07-02T04:40:29Z","title":"Skywork-Reward-V2: Scaling Preference Data Curation via Human-AI Synergy"},"references":{"count":13,"internal_anchors":0,"resolved_work":13,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Most BT-based models fall under the sequence classifier category, while generative models primarily include LLM-as-a-Judge approaches","work_id":"112ea89c-bff6-4bbb-bfef-157919436666","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"This stratification identifies objective/low-controversial versus subjective/high- controversial regions, where intransitivity is more common","work_id":"843045db-9b8b-47d0-8b30-a624baed982e","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Error-driven adaptive retrieval focuses on “unstable” regions.In Stage 1, we repeatedly train an RM, evaluate it on human-verified gold data, and use error-driven adaptive retrieval to pull in new exa","work_id":"bc2f160c-bde8-43f8-93e1-af92ec24f101","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Stage 2 dual-RM consistency filtering targets contradictory signals.Stage 2 introduces a consistency filter: we train a gold RM on cumulative human-verified samples and use it together with the Stage-","work_id":"5f8ba03c-8b02-45b9-942b-57e2c763faa3","year":2024},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Human annotators may not be experts in all types of math and coding problems","work_id":"d72fb68a-3e6b-484f-8884-df2369714391","year":null}],"snapshot_sha256":"f3ae2e6a6817d99eb2b25ae3606bec9c875e44c35f925c22882207ae6f7c6400"},"source":{"id":"2507.01352","kind":"arxiv","version":3},"verdict":{"created_at":"2026-05-16T22:13:47.494891Z","id":"80bb961c-ece1-4f6d-bbb6-d119aab4c401","model_set":{"reader":"grok-4.3"},"one_line_summary":"Skywork-Reward-V2 models trained on 26 million human-AI curated preference pairs set new state-of-the-art results on seven major reward model benchmarks.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Human-AI synergy curates 40 million preference pairs to train state-of-the-art reward models.","strongest_claim":"Skywork-Reward-V2 models achieve state-of-the-art performance across seven major reward model benchmarks, outperform generative reward models, and demonstrate strong downstream performance.","weakest_assumption":"The brittleness of current reward models stems primarily from limitations in preference datasets, and the human-AI synergistic pipeline produces measurably higher-quality data that directly causes the reported benchmark gains."}},"verdict_id":"80bb961c-ece1-4f6d-bbb6-d119aab4c401"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:7856ee217565fae4ed7a0c09ca6bf232cc921931f44d4642df3286978ca7d250","target":"record","created_at":"2026-05-17T23:38:46Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"eed245eae4f5619efa15a39473bb27c79db95695756a3e819b98bcf16f934774","cross_cats_sorted":["cs.AI","cs.LG"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2025-07-02T04:40:29Z","title_canon_sha256":"6f033713f4cab0f8de5ba8bf4b556e999b6b3850ba518e493c47fec4a93cd745"},"schema_version":"1.0","source":{"id":"2507.01352","kind":"arxiv","version":3}},"canonical_sha256":"77943e787d6f01836ec34b2347926368c1e8d65c863c19e419cb78384dbde901","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"77943e787d6f01836ec34b2347926368c1e8d65c863c19e419cb78384dbde901","first_computed_at":"2026-05-17T23:38:46.413654Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:46.413654Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"5FLtFz7X9nonuDgoiFFjgHVkm3Vuig40Fq9VPtamdr3JmY1X003kPK4qHQSvuhkWYs1bf52xlCZdOIqC5UmDAw==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:46.414238Z","signed_message":"canonical_sha256_bytes"},"source_id":"2507.01352","source_kind":"arxiv","source_version":3}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:7856ee217565fae4ed7a0c09ca6bf232cc921931f44d4642df3286978ca7d250","sha256:7c2f6911bb60d9ef9973497b525bafb6e82586ce00025a095f59ff5ced41850a"],"state_sha256":"158087c0f8683fa6b764ff590880dc857b03897af84e0de6f6aea364ce02dd23"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"y0KrvBEeUewLmZ52oxxGbctiTt0mwIQwA2SejWe7g4wZeVdLrWgCd7KBGSUB40FCVoW5M3Q2eEVwW93MMfjXCw==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-25T23:18:25.652168Z","bundle_sha256":"1dd90da6b45390f4f639826847c19b1b45dbbab85eb5f513a6ad9628c938980e"}}