{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:2Y5C4DL6PAZY4EYIRJ3M436ME4","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"5999aef1ee86eaf70f9cf0035c14dbdc6f1970b9529e05624f4cf96364068314","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-14T10:19:19Z","title_canon_sha256":"327da6f86814c6f71f62bd764413f8dc9a6036e19e7f1918754756fa3cd60b41"},"schema_version":"1.0","source":{"id":"2605.14664","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.14664","created_at":"2026-05-17T23:39:02Z"},{"alias_kind":"arxiv_version","alias_value":"2605.14664v1","created_at":"2026-05-17T23:39:02Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.14664","created_at":"2026-05-17T23:39:02Z"},{"alias_kind":"pith_short_12","alias_value":"2Y5C4DL6PAZY","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"2Y5C4DL6PAZY4EYI","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"2Y5C4DL6","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:141be6735743f2968085f35ecb5ec8f807e7513d909382917fe25c2fbf98af49","target":"graph","created_at":"2026-05-17T23:39:02Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Experiments demonstrate that MiVE achieves state-of-the-art performance by ranking highest in human preference, outperforming both academic methods and commercial systems."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"VLM layers encode complementary information hierarchically -- early layers capture localized spatial details essential for precise editing, while deeper layers encode global semantics for instruction comprehension."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"MiVE repurposes VLMs as multiscale feature extractors integrated into a unified self-attention Diffusion Transformer, achieving top human preference in reference-guided video editing."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"MiVE pulls multiscale features from a single vision-language model to guide accurate reference-based video edits."}],"snapshot_sha256":"92d605ebcdfa646fa449facea177c9b6fa83ce9f227dcef248e072f872b203c2"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"593fd33e42c26674ba2507b358353c6eb1e3c1fbad57963139b2ce21b8c80320"},"paper":{"abstract_excerpt":"Reference-guided video editing takes a source video, a text instruction, and a reference image as inputs, requiring the model to faithfully apply the instructed edits while preserving original motion and unedited content. Existing methods fall into two paradigms, each with inherent limitations: decoupled encoders suffer from modality gaps when processing instructions and visual content independently, while unified vision-language encoders lose fine-grained spatial details by relying solely on final-layer representations. We observe that VLM layers encode complementary information hierarchicall","authors_text":"Chengjing Wu, Luoqi Liu, Meng Zou, Ting Liu, Tong Wang, Xiaochao Qu, Xiaolin Hu","cross_cats":[],"headline":"MiVE pulls multiscale features from a single vision-language model to guide accurate reference-based video edits.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-14T10:19:19Z","title":"MiVE: Multiscale Vision-language features for reference-guided video Editing"},"references":{"count":34,"internal_anchors":9,"resolved_work":34,"sample":[{"cited_arxiv_id":"2503.07598","doi":"10.48550/arxiv.2503.07598","is_internal_anchor":true,"ref_index":1,"title":"VACE: All-in-One Video Creation and Editing","work_id":"c68efbde-3431-4655-a337-87e2871ad6a3","year":2025},{"cited_arxiv_id":"","doi":"10.1145/3721238.3730673","is_internal_anchor":false,"ref_index":2,"title":"VideoPainter: Any-length Video Inpainting and Editing with Plug-and-Play Context Control , booktitle =","work_id":"78615cd4-c278-4978-a0f6-d3afbc8233f2","year":2025},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"2025 , url =","work_id":"2fc4524e-07d0-4e9e-80a7-763c7eafe6b5","year":2025},{"cited_arxiv_id":"","doi":"10.48550/arxiv.2512.02933","is_internal_anchor":false,"ref_index":4,"title":"CoRR , volume =","work_id":"a99c19f6-9776-4fd6-ad91-9f6268407f2e","year":2025},{"cited_arxiv_id":"2512.07469","doi":"10.48550/arxiv.2512.07469","is_internal_anchor":true,"ref_index":5,"title":"VideoCoF: Unified Video Editing with Temporal Reasoner","work_id":"41eadf73-66bd-4058-9dc2-d9d37bc8f31b","year":2025}],"snapshot_sha256":"de5cf2273ec99fe5e306df8f9d17b2ea6c9955f2bf3593a0ca4b0b4a826763c7"},"source":{"id":"2605.14664","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-15T05:19:37.778019Z","id":"6cc176bd-ca3d-4252-a9f8-09686389c125","model_set":{"reader":"grok-4.3"},"one_line_summary":"MiVE repurposes VLMs as multiscale feature extractors integrated into a unified self-attention Diffusion Transformer, achieving top human preference in reference-guided video editing.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"MiVE pulls multiscale features from a single vision-language model to guide accurate reference-based video edits.","strongest_claim":"Experiments demonstrate that MiVE achieves state-of-the-art performance by ranking highest in human preference, outperforming both academic methods and commercial systems.","weakest_assumption":"VLM layers encode complementary information hierarchically -- early layers capture localized spatial details essential for precise editing, while deeper layers encode global semantics for instruction comprehension."}},"verdict_id":"6cc176bd-ca3d-4252-a9f8-09686389c125"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:b4a4b48944be0937759021ab14e5b1c390317a7f17fddb87d712404b1f1f3cf3","target":"record","created_at":"2026-05-17T23:39:02Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"5999aef1ee86eaf70f9cf0035c14dbdc6f1970b9529e05624f4cf96364068314","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-14T10:19:19Z","title_canon_sha256":"327da6f86814c6f71f62bd764413f8dc9a6036e19e7f1918754756fa3cd60b41"},"schema_version":"1.0","source":{"id":"2605.14664","kind":"arxiv","version":1}},"canonical_sha256":"d63a2e0d7e78338e13088a76ce6fcc27059ec6f3051ee4ab8722391e4dc5b822","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"d63a2e0d7e78338e13088a76ce6fcc27059ec6f3051ee4ab8722391e4dc5b822","first_computed_at":"2026-05-17T23:39:02.164730Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:39:02.164730Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"hSJwxRmjtWek3J9EUTi8ZAvzZlgyLRmAgqnllOe+oz8LPU/7CitMM1+i9eHOT8k+iCdLBdNQWQWrcWSmATpqAQ==","signature_status":"signed_v1","signed_at":"2026-05-17T23:39:02.165302Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.14664","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:b4a4b48944be0937759021ab14e5b1c390317a7f17fddb87d712404b1f1f3cf3","sha256:141be6735743f2968085f35ecb5ec8f807e7513d909382917fe25c2fbf98af49"],"state_sha256":"a91055807500398a4e4d0e9c1cd5041588e4e319f12cd6c87d5654eb0737bb06"}