{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:2Y5C4DL6PAZY4EYIRJ3M436ME4","short_pith_number":"pith:2Y5C4DL6","canonical_record":{"source":{"id":"2605.14664","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-14T10:19:19Z","cross_cats_sorted":[],"title_canon_sha256":"327da6f86814c6f71f62bd764413f8dc9a6036e19e7f1918754756fa3cd60b41","abstract_canon_sha256":"5999aef1ee86eaf70f9cf0035c14dbdc6f1970b9529e05624f4cf96364068314"},"schema_version":"1.0"},"canonical_sha256":"d63a2e0d7e78338e13088a76ce6fcc27059ec6f3051ee4ab8722391e4dc5b822","source":{"kind":"arxiv","id":"2605.14664","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.14664","created_at":"2026-05-17T23:39:02Z"},{"alias_kind":"arxiv_version","alias_value":"2605.14664v1","created_at":"2026-05-17T23:39:02Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.14664","created_at":"2026-05-17T23:39:02Z"},{"alias_kind":"pith_short_12","alias_value":"2Y5C4DL6PAZY","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"2Y5C4DL6PAZY4EYI","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"2Y5C4DL6","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:2Y5C4DL6PAZY4EYIRJ3M436ME4","target":"record","payload":{"canonical_record":{"source":{"id":"2605.14664","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-14T10:19:19Z","cross_cats_sorted":[],"title_canon_sha256":"327da6f86814c6f71f62bd764413f8dc9a6036e19e7f1918754756fa3cd60b41","abstract_canon_sha256":"5999aef1ee86eaf70f9cf0035c14dbdc6f1970b9529e05624f4cf96364068314"},"schema_version":"1.0"},"canonical_sha256":"d63a2e0d7e78338e13088a76ce6fcc27059ec6f3051ee4ab8722391e4dc5b822","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:39:02.165302Z","signature_b64":"hSJwxRmjtWek3J9EUTi8ZAvzZlgyLRmAgqnllOe+oz8LPU/7CitMM1+i9eHOT8k+iCdLBdNQWQWrcWSmATpqAQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"d63a2e0d7e78338e13088a76ce6fcc27059ec6f3051ee4ab8722391e4dc5b822","last_reissued_at":"2026-05-17T23:39:02.164730Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:39:02.164730Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2605.14664","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:39:02Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"xCDeTdGY22hEx0z4HeL4mZ8P6xA4BkoiXF5vle11LVAxr6DYP9phgFCYBF/5jJOnzQ86fwOS8psqHEuVmf7/DQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-30T07:21:02.971165Z"},"content_sha256":"b4a4b48944be0937759021ab14e5b1c390317a7f17fddb87d712404b1f1f3cf3","schema_version":"1.0","event_id":"sha256:b4a4b48944be0937759021ab14e5b1c390317a7f17fddb87d712404b1f1f3cf3"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:2Y5C4DL6PAZY4EYIRJ3M436ME4","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"MiVE: Multiscale Vision-language features for reference-guided video Editing","license":"http://creativecommons.org/licenses/by/4.0/","headline":"MiVE pulls multiscale features from a single vision-language model to guide accurate reference-based video edits.","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Chengjing Wu, Luoqi Liu, Meng Zou, Ting Liu, Tong Wang, Xiaochao Qu, Xiaolin Hu","submitted_at":"2026-05-14T10:19:19Z","abstract_excerpt":"Reference-guided video editing takes a source video, a text instruction, and a reference image as inputs, requiring the model to faithfully apply the instructed edits while preserving original motion and unedited content. Existing methods fall into two paradigms, each with inherent limitations: decoupled encoders suffer from modality gaps when processing instructions and visual content independently, while unified vision-language encoders lose fine-grained spatial details by relying solely on final-layer representations. We observe that VLM layers encode complementary information hierarchicall"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Experiments demonstrate that MiVE achieves state-of-the-art performance by ranking highest in human preference, outperforming both academic methods and commercial systems.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"VLM layers encode complementary information hierarchically -- early layers capture localized spatial details essential for precise editing, while deeper layers encode global semantics for instruction comprehension.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"MiVE repurposes VLMs as multiscale feature extractors integrated into a unified self-attention Diffusion Transformer, achieving top human preference in reference-guided video editing.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"MiVE pulls multiscale features from a single vision-language model to guide accurate reference-based video edits.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"92d605ebcdfa646fa449facea177c9b6fa83ce9f227dcef248e072f872b203c2"},"source":{"id":"2605.14664","kind":"arxiv","version":1},"verdict":{"id":"6cc176bd-ca3d-4252-a9f8-09686389c125","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T05:19:37.778019Z","strongest_claim":"Experiments demonstrate that MiVE achieves state-of-the-art performance by ranking highest in human preference, outperforming both academic methods and commercial systems.","one_line_summary":"MiVE repurposes VLMs as multiscale feature extractors integrated into a unified self-attention Diffusion Transformer, achieving top human preference in reference-guided video editing.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"VLM layers encode complementary information hierarchically -- early layers capture localized spatial details essential for precise editing, while deeper layers encode global semantics for instruction comprehension.","pith_extraction_headline":"MiVE pulls multiscale features from a single vision-language model to guide accurate reference-based video edits."},"references":{"count":34,"sample":[{"doi":"10.48550/arxiv.2503.07598","year":2025,"title":"VACE: All-in-One Video Creation and Editing","work_id":"c68efbde-3431-4655-a337-87e2871ad6a3","ref_index":1,"cited_arxiv_id":"2503.07598","is_internal_anchor":true},{"doi":"10.1145/3721238.3730673","year":2025,"title":"VideoPainter: Any-length Video Inpainting and Editing with Plug-and-Play Context Control , booktitle =","work_id":"78615cd4-c278-4978-a0f6-d3afbc8233f2","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2025,"title":"2025 , url =","work_id":"2fc4524e-07d0-4e9e-80a7-763c7eafe6b5","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"10.48550/arxiv.2512.02933","year":2025,"title":"CoRR , volume =","work_id":"a99c19f6-9776-4fd6-ad91-9f6268407f2e","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"10.48550/arxiv.2512.07469","year":2025,"title":"VideoCoF: Unified Video Editing with Temporal Reasoner","work_id":"41eadf73-66bd-4058-9dc2-d9d37bc8f31b","ref_index":5,"cited_arxiv_id":"2512.07469","is_internal_anchor":true}],"resolved_work":34,"snapshot_sha256":"de5cf2273ec99fe5e306df8f9d17b2ea6c9955f2bf3593a0ca4b0b4a826763c7","internal_anchors":9},"formal_canon":{"evidence_count":2,"snapshot_sha256":"593fd33e42c26674ba2507b358353c6eb1e3c1fbad57963139b2ce21b8c80320"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"6cc176bd-ca3d-4252-a9f8-09686389c125"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:39:02Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"BdSGfoAWilVS9yRMztKv3tCdv45vJfwkzo83DlgRcWYcolecDpUXkQi0scXLbveHJGAZQvtKQu5lsgYYi1C2CQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-30T07:21:02.972161Z"},"content_sha256":"141be6735743f2968085f35ecb5ec8f807e7513d909382917fe25c2fbf98af49","schema_version":"1.0","event_id":"sha256:141be6735743f2968085f35ecb5ec8f807e7513d909382917fe25c2fbf98af49"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/2Y5C4DL6PAZY4EYIRJ3M436ME4/bundle.json","state_url":"https://pith.science/pith/2Y5C4DL6PAZY4EYIRJ3M436ME4/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/2Y5C4DL6PAZY4EYIRJ3M436ME4/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-30T07:21:02Z","links":{"resolver":"https://pith.science/pith/2Y5C4DL6PAZY4EYIRJ3M436ME4","bundle":"https://pith.science/pith/2Y5C4DL6PAZY4EYIRJ3M436ME4/bundle.json","state":"https://pith.science/pith/2Y5C4DL6PAZY4EYIRJ3M436ME4/state.json","well_known_bundle":"https://pith.science/.well-known/pith/2Y5C4DL6PAZY4EYIRJ3M436ME4/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:2Y5C4DL6PAZY4EYIRJ3M436ME4","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"5999aef1ee86eaf70f9cf0035c14dbdc6f1970b9529e05624f4cf96364068314","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-14T10:19:19Z","title_canon_sha256":"327da6f86814c6f71f62bd764413f8dc9a6036e19e7f1918754756fa3cd60b41"},"schema_version":"1.0","source":{"id":"2605.14664","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.14664","created_at":"2026-05-17T23:39:02Z"},{"alias_kind":"arxiv_version","alias_value":"2605.14664v1","created_at":"2026-05-17T23:39:02Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.14664","created_at":"2026-05-17T23:39:02Z"},{"alias_kind":"pith_short_12","alias_value":"2Y5C4DL6PAZY","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"2Y5C4DL6PAZY4EYI","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"2Y5C4DL6","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:141be6735743f2968085f35ecb5ec8f807e7513d909382917fe25c2fbf98af49","target":"graph","created_at":"2026-05-17T23:39:02Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Experiments demonstrate that MiVE achieves state-of-the-art performance by ranking highest in human preference, outperforming both academic methods and commercial systems."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"VLM layers encode complementary information hierarchically -- early layers capture localized spatial details essential for precise editing, while deeper layers encode global semantics for instruction comprehension."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"MiVE repurposes VLMs as multiscale feature extractors integrated into a unified self-attention Diffusion Transformer, achieving top human preference in reference-guided video editing."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"MiVE pulls multiscale features from a single vision-language model to guide accurate reference-based video edits."}],"snapshot_sha256":"92d605ebcdfa646fa449facea177c9b6fa83ce9f227dcef248e072f872b203c2"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"593fd33e42c26674ba2507b358353c6eb1e3c1fbad57963139b2ce21b8c80320"},"paper":{"abstract_excerpt":"Reference-guided video editing takes a source video, a text instruction, and a reference image as inputs, requiring the model to faithfully apply the instructed edits while preserving original motion and unedited content. Existing methods fall into two paradigms, each with inherent limitations: decoupled encoders suffer from modality gaps when processing instructions and visual content independently, while unified vision-language encoders lose fine-grained spatial details by relying solely on final-layer representations. We observe that VLM layers encode complementary information hierarchicall","authors_text":"Chengjing Wu, Luoqi Liu, Meng Zou, Ting Liu, Tong Wang, Xiaochao Qu, Xiaolin Hu","cross_cats":[],"headline":"MiVE pulls multiscale features from a single vision-language model to guide accurate reference-based video edits.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-14T10:19:19Z","title":"MiVE: Multiscale Vision-language features for reference-guided video Editing"},"references":{"count":34,"internal_anchors":9,"resolved_work":34,"sample":[{"cited_arxiv_id":"2503.07598","doi":"10.48550/arxiv.2503.07598","is_internal_anchor":true,"ref_index":1,"title":"VACE: All-in-One Video Creation and Editing","work_id":"c68efbde-3431-4655-a337-87e2871ad6a3","year":2025},{"cited_arxiv_id":"","doi":"10.1145/3721238.3730673","is_internal_anchor":false,"ref_index":2,"title":"VideoPainter: Any-length Video Inpainting and Editing with Plug-and-Play Context Control , booktitle =","work_id":"78615cd4-c278-4978-a0f6-d3afbc8233f2","year":2025},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"2025 , url =","work_id":"2fc4524e-07d0-4e9e-80a7-763c7eafe6b5","year":2025},{"cited_arxiv_id":"","doi":"10.48550/arxiv.2512.02933","is_internal_anchor":false,"ref_index":4,"title":"CoRR , volume =","work_id":"a99c19f6-9776-4fd6-ad91-9f6268407f2e","year":2025},{"cited_arxiv_id":"2512.07469","doi":"10.48550/arxiv.2512.07469","is_internal_anchor":true,"ref_index":5,"title":"VideoCoF: Unified Video Editing with Temporal Reasoner","work_id":"41eadf73-66bd-4058-9dc2-d9d37bc8f31b","year":2025}],"snapshot_sha256":"de5cf2273ec99fe5e306df8f9d17b2ea6c9955f2bf3593a0ca4b0b4a826763c7"},"source":{"id":"2605.14664","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-15T05:19:37.778019Z","id":"6cc176bd-ca3d-4252-a9f8-09686389c125","model_set":{"reader":"grok-4.3"},"one_line_summary":"MiVE repurposes VLMs as multiscale feature extractors integrated into a unified self-attention Diffusion Transformer, achieving top human preference in reference-guided video editing.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"MiVE pulls multiscale features from a single vision-language model to guide accurate reference-based video edits.","strongest_claim":"Experiments demonstrate that MiVE achieves state-of-the-art performance by ranking highest in human preference, outperforming both academic methods and commercial systems.","weakest_assumption":"VLM layers encode complementary information hierarchically -- early layers capture localized spatial details essential for precise editing, while deeper layers encode global semantics for instruction comprehension."}},"verdict_id":"6cc176bd-ca3d-4252-a9f8-09686389c125"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:b4a4b48944be0937759021ab14e5b1c390317a7f17fddb87d712404b1f1f3cf3","target":"record","created_at":"2026-05-17T23:39:02Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"5999aef1ee86eaf70f9cf0035c14dbdc6f1970b9529e05624f4cf96364068314","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-14T10:19:19Z","title_canon_sha256":"327da6f86814c6f71f62bd764413f8dc9a6036e19e7f1918754756fa3cd60b41"},"schema_version":"1.0","source":{"id":"2605.14664","kind":"arxiv","version":1}},"canonical_sha256":"d63a2e0d7e78338e13088a76ce6fcc27059ec6f3051ee4ab8722391e4dc5b822","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"d63a2e0d7e78338e13088a76ce6fcc27059ec6f3051ee4ab8722391e4dc5b822","first_computed_at":"2026-05-17T23:39:02.164730Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:39:02.164730Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"hSJwxRmjtWek3J9EUTi8ZAvzZlgyLRmAgqnllOe+oz8LPU/7CitMM1+i9eHOT8k+iCdLBdNQWQWrcWSmATpqAQ==","signature_status":"signed_v1","signed_at":"2026-05-17T23:39:02.165302Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.14664","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:b4a4b48944be0937759021ab14e5b1c390317a7f17fddb87d712404b1f1f3cf3","sha256:141be6735743f2968085f35ecb5ec8f807e7513d909382917fe25c2fbf98af49"],"state_sha256":"a91055807500398a4e4d0e9c1cd5041588e4e319f12cd6c87d5654eb0737bb06"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"+YYIq2kuPnNKvtqoJ90VMy4TZVDUHwxmofJqBhjHDnwpAf3bwFh71apqPe3mRMA06u/nmTyR/EB0RBQMmrASCQ==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-30T07:21:02.976548Z","bundle_sha256":"34c29afddd0bcee5df72cac3d3c1337293a808cfa4e71e15d1a5a569859ba1d2"}}