{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2024:AN42PZ3SINI53U4YQ4ZFO7LW7Y","short_pith_number":"pith:AN42PZ3S","canonical_record":{"source":{"id":"2412.21059","kind":"arxiv","version":4},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2024-12-30T16:24:09Z","cross_cats_sorted":[],"title_canon_sha256":"39a0666368477f74ebf28eeba6072b7039948ddac20aa00d38326e055b7ff834","abstract_canon_sha256":"c2ed7078fe147ba9a4e48afb64c94fad27a9c834e9d5e7fb1c6dac3523162af1"},"schema_version":"1.0"},"canonical_sha256":"0379a7e7724351ddd3988732577d76fe1eaf1925f7e11dc39feaae3aab0dcf6e","source":{"kind":"arxiv","id":"2412.21059","version":4},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2412.21059","created_at":"2026-05-17T23:38:47Z"},{"alias_kind":"arxiv_version","alias_value":"2412.21059v4","created_at":"2026-05-17T23:38:47Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2412.21059","created_at":"2026-05-17T23:38:47Z"},{"alias_kind":"pith_short_12","alias_value":"AN42PZ3SINI5","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"AN42PZ3SINI53U4Y","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"AN42PZ3S","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2024:AN42PZ3SINI53U4YQ4ZFO7LW7Y","target":"record","payload":{"canonical_record":{"source":{"id":"2412.21059","kind":"arxiv","version":4},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2024-12-30T16:24:09Z","cross_cats_sorted":[],"title_canon_sha256":"39a0666368477f74ebf28eeba6072b7039948ddac20aa00d38326e055b7ff834","abstract_canon_sha256":"c2ed7078fe147ba9a4e48afb64c94fad27a9c834e9d5e7fb1c6dac3523162af1"},"schema_version":"1.0"},"canonical_sha256":"0379a7e7724351ddd3988732577d76fe1eaf1925f7e11dc39feaae3aab0dcf6e","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:47.977934Z","signature_b64":"jJnaKccMNtqsdl9z3sHcOVCRDqzKdVQ2OMI5o714XkrBlRsCOMxqM5eY61gZWe5rr+1l4meZsIpKUOFK2ef4CA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"0379a7e7724351ddd3988732577d76fe1eaf1925f7e11dc39feaae3aab0dcf6e","last_reissued_at":"2026-05-17T23:38:47.977331Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:47.977331Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2412.21059","source_version":4,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:47Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"hSZHTrPhZjRicgcgMiROrL1b7TQ2GnF6iBkrxXCicB7Iv8ZL6yuJxNXfR/rOc1NBhBtyYIYUXas2H63onetkCQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-05T06:22:30.986254Z"},"content_sha256":"bd734eaed563f7db2b5676c054986a9eece90ae7d9c12042295d459f651330fe","schema_version":"1.0","event_id":"sha256:bd734eaed563f7db2b5676c054986a9eece90ae7d9c12042295d459f651330fe"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2024:AN42PZ3SINI53U4YQ4ZFO7LW7Y","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"VisionReward: Fine-Grained Multi-Dimensional Human Preference Learning for Image and Video Generation","license":"http://creativecommons.org/licenses/by/4.0/","headline":"VisionReward learns fine-grained human preferences for image and video generation through hierarchical assessment and linear weighting.","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Dan Zhang, Jiajun Xu, Jiale Cheng, Jiayan Teng, Jiazheng Xu, Jie Tang, Ming Ding, Minlie Huang, Qunlin Jin, Shen Yang, Shiyu Huang, Shurun Li, Wenbo Duan, Wendi Zheng, Xiaohan Zhang, Xiao Liu, Xiaotao Gu, Yuanming Yang, Yuan Wang, Yu Huang, Yuxiao Dong, Zhuoyi Yang","submitted_at":"2024-12-30T16:24:09Z","abstract_excerpt":"Visual generative models have achieved remarkable progress in synthesizing photorealistic images and videos, yet aligning their outputs with human preferences across critical dimensions remains a persistent challenge. Though reinforcement learning from human feedback offers promise for preference alignment, existing reward models for visual generation face limitations, including black-box scoring without interpretability and potentially resultant unexpected biases. We present VisionReward, a general framework for learning human visual preferences in both image and video generation. Specificall"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"VisionReward surpasses VideoScore by 17.2% in preference prediction accuracy, and text-to-video models with VisionReward achieve a 31.6% higher pairwise win rate compared to the same models using VideoScore.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the proposed hierarchical visual assessment framework combined with linear weighting accurately captures fine-grained human preferences across dimensions without introducing unexpected biases or inconsistencies.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"VisionReward learns multi-dimensional human preferences for image and video generation via hierarchical assessment and linear weighting, outperforming VideoScore by 17.2% in prediction accuracy and yielding 31.6% higher win rates in text-to-video models.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"VisionReward learns fine-grained human preferences for image and video generation through hierarchical assessment and linear weighting.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"a6acc44e6ef17cf5d40bda7ebcdec77902beefc570af05847143e06e66edd25c"},"source":{"id":"2412.21059","kind":"arxiv","version":4},"verdict":{"id":"ad95b6c9-6a77-441b-a0b3-9867e161a911","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-16T11:45:06.312296Z","strongest_claim":"VisionReward surpasses VideoScore by 17.2% in preference prediction accuracy, and text-to-video models with VisionReward achieve a 31.6% higher pairwise win rate compared to the same models using VideoScore.","one_line_summary":"VisionReward learns multi-dimensional human preferences for image and video generation via hierarchical assessment and linear weighting, outperforming VideoScore by 17.2% in prediction accuracy and yielding 31.6% higher win rates in text-to-video models.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the proposed hierarchical visual assessment framework combined with linear weighting accurately captures fine-grained human preferences across dimensions without introducing unexpected biases or inconsistencies.","pith_extraction_headline":"VisionReward learns fine-grained human preferences for image and video generation through hierarchical assessment and linear weighting."},"references":{"count":59,"sample":[{"doi":"","year":2021,"title":"International conference on machine learning , pages=","work_id":"a096dd73-9e87-4fea-b180-fc5175415c79","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Advances in Neural Information Processing Systems , volume=","work_id":"49bfdf94-b4a7-4662-b0c6-c9081230ee76","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Advances in neural information processing systems , volume=","work_id":"0aa0ce1a-9941-46aa-ad9b-5d1b6aed53ea","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2022,"title":"2022 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) , pages=","work_id":"c54cc136-9872-46fd-bc11-87faa6e167ea","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"SDXL: Improving Latent Diffusion Models for High-Resolution Image Synthesis","work_id":"8034c587-fba6-4941-87ba-c98f2ac962cb","ref_index":5,"cited_arxiv_id":"2307.01952","is_internal_anchor":true}],"resolved_work":59,"snapshot_sha256":"a530bf0479498cc5008bbdfeb2a33b1049203194189d4487b4908a3365bb6ce1","internal_anchors":14},"formal_canon":{"evidence_count":2,"snapshot_sha256":"7c161c9097056dcb00d6dcf2cdb63589e84aa1dbe74df31471714826c013e276"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"ad95b6c9-6a77-441b-a0b3-9867e161a911"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:47Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"J3Y2kpDQBdq3o7RyGebWizf5yI2AdqOr+JqGdniJIh7chxwe78ij7DdbUx9CKAN5R+Aey7r50/drR1W2x5o8Cg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-05T06:22:30.987120Z"},"content_sha256":"8ab52fc8aebb08301f0be0c9d6ec25e62e19d210e96680059c36732ac71309e4","schema_version":"1.0","event_id":"sha256:8ab52fc8aebb08301f0be0c9d6ec25e62e19d210e96680059c36732ac71309e4"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/AN42PZ3SINI53U4YQ4ZFO7LW7Y/bundle.json","state_url":"https://pith.science/pith/AN42PZ3SINI53U4YQ4ZFO7LW7Y/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/AN42PZ3SINI53U4YQ4ZFO7LW7Y/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-05T06:22:30Z","links":{"resolver":"https://pith.science/pith/AN42PZ3SINI53U4YQ4ZFO7LW7Y","bundle":"https://pith.science/pith/AN42PZ3SINI53U4YQ4ZFO7LW7Y/bundle.json","state":"https://pith.science/pith/AN42PZ3SINI53U4YQ4ZFO7LW7Y/state.json","well_known_bundle":"https://pith.science/.well-known/pith/AN42PZ3SINI53U4YQ4ZFO7LW7Y/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2024:AN42PZ3SINI53U4YQ4ZFO7LW7Y","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"c2ed7078fe147ba9a4e48afb64c94fad27a9c834e9d5e7fb1c6dac3523162af1","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2024-12-30T16:24:09Z","title_canon_sha256":"39a0666368477f74ebf28eeba6072b7039948ddac20aa00d38326e055b7ff834"},"schema_version":"1.0","source":{"id":"2412.21059","kind":"arxiv","version":4}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2412.21059","created_at":"2026-05-17T23:38:47Z"},{"alias_kind":"arxiv_version","alias_value":"2412.21059v4","created_at":"2026-05-17T23:38:47Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2412.21059","created_at":"2026-05-17T23:38:47Z"},{"alias_kind":"pith_short_12","alias_value":"AN42PZ3SINI5","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"AN42PZ3SINI53U4Y","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"AN42PZ3S","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:8ab52fc8aebb08301f0be0c9d6ec25e62e19d210e96680059c36732ac71309e4","target":"graph","created_at":"2026-05-17T23:38:47Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"VisionReward surpasses VideoScore by 17.2% in preference prediction accuracy, and text-to-video models with VisionReward achieve a 31.6% higher pairwise win rate compared to the same models using VideoScore."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the proposed hierarchical visual assessment framework combined with linear weighting accurately captures fine-grained human preferences across dimensions without introducing unexpected biases or inconsistencies."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"VisionReward learns multi-dimensional human preferences for image and video generation via hierarchical assessment and linear weighting, outperforming VideoScore by 17.2% in prediction accuracy and yielding 31.6% higher win rates in text-to-video models."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"VisionReward learns fine-grained human preferences for image and video generation through hierarchical assessment and linear weighting."}],"snapshot_sha256":"a6acc44e6ef17cf5d40bda7ebcdec77902beefc570af05847143e06e66edd25c"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"7c161c9097056dcb00d6dcf2cdb63589e84aa1dbe74df31471714826c013e276"},"paper":{"abstract_excerpt":"Visual generative models have achieved remarkable progress in synthesizing photorealistic images and videos, yet aligning their outputs with human preferences across critical dimensions remains a persistent challenge. Though reinforcement learning from human feedback offers promise for preference alignment, existing reward models for visual generation face limitations, including black-box scoring without interpretability and potentially resultant unexpected biases. We present VisionReward, a general framework for learning human visual preferences in both image and video generation. Specificall","authors_text":"Dan Zhang, Jiajun Xu, Jiale Cheng, Jiayan Teng, Jiazheng Xu, Jie Tang, Ming Ding, Minlie Huang, Qunlin Jin, Shen Yang, Shiyu Huang, Shurun Li, Wenbo Duan, Wendi Zheng, Xiaohan Zhang, Xiao Liu, Xiaotao Gu, Yuanming Yang, Yuan Wang, Yu Huang, Yuxiao Dong, Zhuoyi Yang","cross_cats":[],"headline":"VisionReward learns fine-grained human preferences for image and video generation through hierarchical assessment and linear weighting.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2024-12-30T16:24:09Z","title":"VisionReward: Fine-Grained Multi-Dimensional Human Preference Learning for Image and Video Generation"},"references":{"count":59,"internal_anchors":14,"resolved_work":59,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"International conference on machine learning , pages=","work_id":"a096dd73-9e87-4fea-b180-fc5175415c79","year":2021},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Advances in Neural Information Processing Systems , volume=","work_id":"49bfdf94-b4a7-4662-b0c6-c9081230ee76","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Advances in neural information processing systems , volume=","work_id":"0aa0ce1a-9941-46aa-ad9b-5d1b6aed53ea","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"2022 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) , pages=","work_id":"c54cc136-9872-46fd-bc11-87faa6e167ea","year":2022},{"cited_arxiv_id":"2307.01952","doi":"","is_internal_anchor":true,"ref_index":5,"title":"SDXL: Improving Latent Diffusion Models for High-Resolution Image Synthesis","work_id":"8034c587-fba6-4941-87ba-c98f2ac962cb","year":null}],"snapshot_sha256":"a530bf0479498cc5008bbdfeb2a33b1049203194189d4487b4908a3365bb6ce1"},"source":{"id":"2412.21059","kind":"arxiv","version":4},"verdict":{"created_at":"2026-05-16T11:45:06.312296Z","id":"ad95b6c9-6a77-441b-a0b3-9867e161a911","model_set":{"reader":"grok-4.3"},"one_line_summary":"VisionReward learns multi-dimensional human preferences for image and video generation via hierarchical assessment and linear weighting, outperforming VideoScore by 17.2% in prediction accuracy and yielding 31.6% higher win rates in text-to-video models.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"VisionReward learns fine-grained human preferences for image and video generation through hierarchical assessment and linear weighting.","strongest_claim":"VisionReward surpasses VideoScore by 17.2% in preference prediction accuracy, and text-to-video models with VisionReward achieve a 31.6% higher pairwise win rate compared to the same models using VideoScore.","weakest_assumption":"That the proposed hierarchical visual assessment framework combined with linear weighting accurately captures fine-grained human preferences across dimensions without introducing unexpected biases or inconsistencies."}},"verdict_id":"ad95b6c9-6a77-441b-a0b3-9867e161a911"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:bd734eaed563f7db2b5676c054986a9eece90ae7d9c12042295d459f651330fe","target":"record","created_at":"2026-05-17T23:38:47Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"c2ed7078fe147ba9a4e48afb64c94fad27a9c834e9d5e7fb1c6dac3523162af1","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2024-12-30T16:24:09Z","title_canon_sha256":"39a0666368477f74ebf28eeba6072b7039948ddac20aa00d38326e055b7ff834"},"schema_version":"1.0","source":{"id":"2412.21059","kind":"arxiv","version":4}},"canonical_sha256":"0379a7e7724351ddd3988732577d76fe1eaf1925f7e11dc39feaae3aab0dcf6e","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"0379a7e7724351ddd3988732577d76fe1eaf1925f7e11dc39feaae3aab0dcf6e","first_computed_at":"2026-05-17T23:38:47.977331Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:47.977331Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"jJnaKccMNtqsdl9z3sHcOVCRDqzKdVQ2OMI5o714XkrBlRsCOMxqM5eY61gZWe5rr+1l4meZsIpKUOFK2ef4CA==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:47.977934Z","signed_message":"canonical_sha256_bytes"},"source_id":"2412.21059","source_kind":"arxiv","source_version":4}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:bd734eaed563f7db2b5676c054986a9eece90ae7d9c12042295d459f651330fe","sha256:8ab52fc8aebb08301f0be0c9d6ec25e62e19d210e96680059c36732ac71309e4"],"state_sha256":"800cdb63b867d27a36badb47c366662078b1c64be56064777e886f3ade96f5e7"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"QoCUlUyOZt65xTQAMeZFso8HUSGoD963/+HBk5a5cF0+sOP3ZEYb4Hg3LbC4zOrzZtou0LX5O0GqlMZ+MgE9Cw==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-05T06:22:30.991057Z","bundle_sha256":"fc68b11c4f5999d7ec66bc91922c22bf78c3325ea29dab342ff3aed76a148bf1"}}