{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:HFQBZVIDLX4CUCQZJBDV3CIAEL","short_pith_number":"pith:HFQBZVID","canonical_record":{"source":{"id":"2601.04068","kind":"arxiv","version":4},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2026-01-07T16:32:17Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"88155bb746b5acc291ca04674b6ac289e3d0a10639fd4dfc314d8efafefd3ca0","abstract_canon_sha256":"43878163f4f400605643cda096dfa433a4547fb1f1f7f6d08d8bd90c7630978b"},"schema_version":"1.0"},"canonical_sha256":"39601cd5035df82a0a1948475d890022d1a5caa7e4fe1cfe2ac2c83965b3c8b4","source":{"kind":"arxiv","id":"2601.04068","version":4},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2601.04068","created_at":"2026-05-21T01:04:20Z"},{"alias_kind":"arxiv_version","alias_value":"2601.04068v4","created_at":"2026-05-21T01:04:20Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2601.04068","created_at":"2026-05-21T01:04:20Z"},{"alias_kind":"pith_short_12","alias_value":"HFQBZVIDLX4C","created_at":"2026-05-21T01:04:20Z"},{"alias_kind":"pith_short_16","alias_value":"HFQBZVIDLX4CUCQZ","created_at":"2026-05-21T01:04:20Z"},{"alias_kind":"pith_short_8","alias_value":"HFQBZVID","created_at":"2026-05-21T01:04:20Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:HFQBZVIDLX4CUCQZJBDV3CIAEL","target":"record","payload":{"canonical_record":{"source":{"id":"2601.04068","kind":"arxiv","version":4},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2026-01-07T16:32:17Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"88155bb746b5acc291ca04674b6ac289e3d0a10639fd4dfc314d8efafefd3ca0","abstract_canon_sha256":"43878163f4f400605643cda096dfa433a4547fb1f1f7f6d08d8bd90c7630978b"},"schema_version":"1.0"},"canonical_sha256":"39601cd5035df82a0a1948475d890022d1a5caa7e4fe1cfe2ac2c83965b3c8b4","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-21T01:04:20.655666Z","signature_b64":"SX4Gt+V9a9zVhw+xrGjjSAq5awuFe0FKhCdcdEkcxiXDPFz1LBu9HXcOobW2ESly0UYKKpkcngSfBwyz9Bi/DQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"39601cd5035df82a0a1948475d890022d1a5caa7e4fe1cfe2ac2c83965b3c8b4","last_reissued_at":"2026-05-21T01:04:20.654858Z","signature_status":"signed_v1","first_computed_at":"2026-05-21T01:04:20.654858Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2601.04068","source_version":4,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-21T01:04:20Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"BHB6K4Ja/An52IEuOFpuIGtePA0hdz2CfRK7+tr01WreBBqxkEaL9gvhxJxlsVC4rcRjB5ZtARUN2pjAP+kJAg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-27T02:14:46.005796Z"},"content_sha256":"1d6eff0db4121aae3eb73448f1d94e1fafd2f5145f580ad6f040c28d91c1812e","schema_version":"1.0","event_id":"sha256:1d6eff0db4121aae3eb73448f1d94e1fafd2f5145f580ad6f040c28d91c1812e"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:HFQBZVIDLX4CUCQZJBDV3CIAEL","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Mind the Generative Details: Direct Localized Detail Preference Optimization for Video Diffusion Models","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","headline":"LocalDPO aligns text-to-video diffusion models by optimizing preferences only on locally corrupted regions of real videos.","cross_cats":["cs.AI"],"primary_cat":"cs.CV","authors_text":"Chao Gao, Kaidong Zhang, Rui Ding, Wangmeng Zuo, Ying Chen, Yukang Ding, Zitong Huang","submitted_at":"2026-01-07T16:32:17Z","abstract_excerpt":"Aligning text-to-video diffusion models with human preferences is crucial for generating high-quality videos. Existing Direct Preference Otimization (DPO) methods rely on multi-sample ranking and task-specific critic models, which is inefficient and often yields ambiguous global supervision. To address these limitations, we propose LocalDPO, a novel post-training framework that constructs localized preference pairs from real videos and optimizes alignment at the spatio-temporal region level. We design an automated pipeline to efficiently collect preference pair data that generates preference p"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Experiments on Wan2.1 and CogVideoX demonstrate that LocalDPO consistently improves video fidelity, temporal coherence and human preference scores over other post-training approaches, establishing a more efficient and fine-grained paradigm for video generator alignment.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That videos created by locally masking real footage and inpainting only the masked regions with the frozen base model produce negatives whose flaws correspond to the kinds of errors humans actually dislike at the region level.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"LocalDPO creates localized preference pairs from real videos by applying random spatio-temporal masks and restoring masked regions with the frozen base model, then applies region-restricted DPO loss to improve fidelity and coherence in video diffusion models.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"LocalDPO aligns text-to-video diffusion models by optimizing preferences only on locally corrupted regions of real videos.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"60c902450b7184a50d91626d4b808c1f89e5332595253034078ea76a10196f6c"},"source":{"id":"2601.04068","kind":"arxiv","version":4},"verdict":{"id":"b2ddb26c-0edf-4b26-957f-43d53edaa3a9","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-16T16:21:19.499664Z","strongest_claim":"Experiments on Wan2.1 and CogVideoX demonstrate that LocalDPO consistently improves video fidelity, temporal coherence and human preference scores over other post-training approaches, establishing a more efficient and fine-grained paradigm for video generator alignment.","one_line_summary":"LocalDPO creates localized preference pairs from real videos by applying random spatio-temporal masks and restoring masked regions with the frozen base model, then applies region-restricted DPO loss to improve fidelity and coherence in video diffusion models.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That videos created by locally masking real footage and inpainting only the masked regions with the frozen base model produce negatives whose flaws correspond to the kinds of errors humans actually dislike at the region level.","pith_extraction_headline":"LocalDPO aligns text-to-video diffusion models by optimizing preferences only on locally corrupted regions of real videos."},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2601.04068/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":1,"snapshot_sha256":"e79d6776da67ec689bdf78b8854f8051b865969be2b761f476b2987a51897fe7"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"b2ddb26c-0edf-4b26-957f-43d53edaa3a9"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-21T01:04:20Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"iGAbTcjyKqi82zkGHDuHXMFlE4Dp4FqjDKwrRaZu4XAnh6GC8isx5sZHlpYmnsMdYTtf5PF8+zZgLdwR4m6oAA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-27T02:14:46.006340Z"},"content_sha256":"46dde328ec98d3cee770a2262b273e7760e2bc03f5b69babcb612eaa3dec9ab1","schema_version":"1.0","event_id":"sha256:46dde328ec98d3cee770a2262b273e7760e2bc03f5b69babcb612eaa3dec9ab1"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/HFQBZVIDLX4CUCQZJBDV3CIAEL/bundle.json","state_url":"https://pith.science/pith/HFQBZVIDLX4CUCQZJBDV3CIAEL/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/HFQBZVIDLX4CUCQZJBDV3CIAEL/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-27T02:14:46Z","links":{"resolver":"https://pith.science/pith/HFQBZVIDLX4CUCQZJBDV3CIAEL","bundle":"https://pith.science/pith/HFQBZVIDLX4CUCQZJBDV3CIAEL/bundle.json","state":"https://pith.science/pith/HFQBZVIDLX4CUCQZJBDV3CIAEL/state.json","well_known_bundle":"https://pith.science/.well-known/pith/HFQBZVIDLX4CUCQZJBDV3CIAEL/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:HFQBZVIDLX4CUCQZJBDV3CIAEL","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"43878163f4f400605643cda096dfa433a4547fb1f1f7f6d08d8bd90c7630978b","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2026-01-07T16:32:17Z","title_canon_sha256":"88155bb746b5acc291ca04674b6ac289e3d0a10639fd4dfc314d8efafefd3ca0"},"schema_version":"1.0","source":{"id":"2601.04068","kind":"arxiv","version":4}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2601.04068","created_at":"2026-05-21T01:04:20Z"},{"alias_kind":"arxiv_version","alias_value":"2601.04068v4","created_at":"2026-05-21T01:04:20Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2601.04068","created_at":"2026-05-21T01:04:20Z"},{"alias_kind":"pith_short_12","alias_value":"HFQBZVIDLX4C","created_at":"2026-05-21T01:04:20Z"},{"alias_kind":"pith_short_16","alias_value":"HFQBZVIDLX4CUCQZ","created_at":"2026-05-21T01:04:20Z"},{"alias_kind":"pith_short_8","alias_value":"HFQBZVID","created_at":"2026-05-21T01:04:20Z"}],"graph_snapshots":[{"event_id":"sha256:46dde328ec98d3cee770a2262b273e7760e2bc03f5b69babcb612eaa3dec9ab1","target":"graph","created_at":"2026-05-21T01:04:20Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Experiments on Wan2.1 and CogVideoX demonstrate that LocalDPO consistently improves video fidelity, temporal coherence and human preference scores over other post-training approaches, establishing a more efficient and fine-grained paradigm for video generator alignment."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That videos created by locally masking real footage and inpainting only the masked regions with the frozen base model produce negatives whose flaws correspond to the kinds of errors humans actually dislike at the region level."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"LocalDPO creates localized preference pairs from real videos by applying random spatio-temporal masks and restoring masked regions with the frozen base model, then applies region-restricted DPO loss to improve fidelity and coherence in video diffusion models."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"LocalDPO aligns text-to-video diffusion models by optimizing preferences only on locally corrupted regions of real videos."}],"snapshot_sha256":"60c902450b7184a50d91626d4b808c1f89e5332595253034078ea76a10196f6c"},"formal_canon":{"evidence_count":1,"snapshot_sha256":"e79d6776da67ec689bdf78b8854f8051b865969be2b761f476b2987a51897fe7"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2601.04068/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Aligning text-to-video diffusion models with human preferences is crucial for generating high-quality videos. Existing Direct Preference Otimization (DPO) methods rely on multi-sample ranking and task-specific critic models, which is inefficient and often yields ambiguous global supervision. To address these limitations, we propose LocalDPO, a novel post-training framework that constructs localized preference pairs from real videos and optimizes alignment at the spatio-temporal region level. We design an automated pipeline to efficiently collect preference pair data that generates preference p","authors_text":"Chao Gao, Kaidong Zhang, Rui Ding, Wangmeng Zuo, Ying Chen, Yukang Ding, Zitong Huang","cross_cats":["cs.AI"],"headline":"LocalDPO aligns text-to-video diffusion models by optimizing preferences only on locally corrupted regions of real videos.","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2026-01-07T16:32:17Z","title":"Mind the Generative Details: Direct Localized Detail Preference Optimization for Video Diffusion Models"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2601.04068","kind":"arxiv","version":4},"verdict":{"created_at":"2026-05-16T16:21:19.499664Z","id":"b2ddb26c-0edf-4b26-957f-43d53edaa3a9","model_set":{"reader":"grok-4.3"},"one_line_summary":"LocalDPO creates localized preference pairs from real videos by applying random spatio-temporal masks and restoring masked regions with the frozen base model, then applies region-restricted DPO loss to improve fidelity and coherence in video diffusion models.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"LocalDPO aligns text-to-video diffusion models by optimizing preferences only on locally corrupted regions of real videos.","strongest_claim":"Experiments on Wan2.1 and CogVideoX demonstrate that LocalDPO consistently improves video fidelity, temporal coherence and human preference scores over other post-training approaches, establishing a more efficient and fine-grained paradigm for video generator alignment.","weakest_assumption":"That videos created by locally masking real footage and inpainting only the masked regions with the frozen base model produce negatives whose flaws correspond to the kinds of errors humans actually dislike at the region level."}},"verdict_id":"b2ddb26c-0edf-4b26-957f-43d53edaa3a9"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:1d6eff0db4121aae3eb73448f1d94e1fafd2f5145f580ad6f040c28d91c1812e","target":"record","created_at":"2026-05-21T01:04:20Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"43878163f4f400605643cda096dfa433a4547fb1f1f7f6d08d8bd90c7630978b","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2026-01-07T16:32:17Z","title_canon_sha256":"88155bb746b5acc291ca04674b6ac289e3d0a10639fd4dfc314d8efafefd3ca0"},"schema_version":"1.0","source":{"id":"2601.04068","kind":"arxiv","version":4}},"canonical_sha256":"39601cd5035df82a0a1948475d890022d1a5caa7e4fe1cfe2ac2c83965b3c8b4","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"39601cd5035df82a0a1948475d890022d1a5caa7e4fe1cfe2ac2c83965b3c8b4","first_computed_at":"2026-05-21T01:04:20.654858Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-21T01:04:20.654858Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"SX4Gt+V9a9zVhw+xrGjjSAq5awuFe0FKhCdcdEkcxiXDPFz1LBu9HXcOobW2ESly0UYKKpkcngSfBwyz9Bi/DQ==","signature_status":"signed_v1","signed_at":"2026-05-21T01:04:20.655666Z","signed_message":"canonical_sha256_bytes"},"source_id":"2601.04068","source_kind":"arxiv","source_version":4}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:1d6eff0db4121aae3eb73448f1d94e1fafd2f5145f580ad6f040c28d91c1812e","sha256:46dde328ec98d3cee770a2262b273e7760e2bc03f5b69babcb612eaa3dec9ab1"],"state_sha256":"6e2fc4a8781c3550dc0832cda64ed2d7175f9add20c62d5b303009e6c15140b6"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"S+Q/SMst6YA5j9lNEI17jxXzc5hwoJx4/pqFbJG/Ggrf+1wBjppZ3GkBS590ywMChfUGowlQcYjTtVjUNGp6Dw==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-27T02:14:46.008870Z","bundle_sha256":"f1f2e7d1722f46ef677eab696bc3a43159d1256e00d82411e2f31c2826d0d04f"}}