{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:5PAHF7TLEYWWSYVPUV2ZLNVO24","short_pith_number":"pith:5PAHF7TL","canonical_record":{"source":{"id":"2603.02175","kind":"arxiv","version":4},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-03-02T18:46:28Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"e82812b8063cd874a10adc3deb3298719b67b3b54e2993ac6ab820884bcb1180","abstract_canon_sha256":"1d4d9c9f52287d7737edd4900dfb6e570d716a7a4a526f3592fcdcb24500696d"},"schema_version":"1.0"},"canonical_sha256":"ebc072fe6b262d6962afa57595b6aed732be15f207cb133df498536b061d0300","source":{"kind":"arxiv","id":"2603.02175","version":4},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2603.02175","created_at":"2026-05-18T03:09:23Z"},{"alias_kind":"arxiv_version","alias_value":"2603.02175v4","created_at":"2026-05-18T03:09:23Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2603.02175","created_at":"2026-05-18T03:09:23Z"},{"alias_kind":"pith_short_12","alias_value":"5PAHF7TLEYWW","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"5PAHF7TLEYWWSYVP","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"5PAHF7TL","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:5PAHF7TLEYWWSYVPUV2ZLNVO24","target":"record","payload":{"canonical_record":{"source":{"id":"2603.02175","kind":"arxiv","version":4},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-03-02T18:46:28Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"e82812b8063cd874a10adc3deb3298719b67b3b54e2993ac6ab820884bcb1180","abstract_canon_sha256":"1d4d9c9f52287d7737edd4900dfb6e570d716a7a4a526f3592fcdcb24500696d"},"schema_version":"1.0"},"canonical_sha256":"ebc072fe6b262d6962afa57595b6aed732be15f207cb133df498536b061d0300","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T03:09:23.220217Z","signature_b64":"xz2r4KhtQNeBxFY4DuWN+shrryLYwgdVUQ8/ucs59h++d0G+brjbwTjlERllAYJ/aTjg4RoTpWDpFB/ZUbU0Bw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"ebc072fe6b262d6962afa57595b6aed732be15f207cb133df498536b061d0300","last_reissued_at":"2026-05-18T03:09:23.219439Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T03:09:23.219439Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2603.02175","source_version":4,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T03:09:23Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"ib1dKporp9finwKv4ZC2LECq743GwKewiBW816Mb2XtSdGk+qzTjnLQ2ii6IGKYEFSc9vnUoRnYXlMZuZ5/gDw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-27T14:10:18.246738Z"},"content_sha256":"2933f12b60891a09587284bd2299849764d64b5b5588d23f3948f918fe91658c","schema_version":"1.0","event_id":"sha256:2933f12b60891a09587284bd2299849764d64b5b5588d23f3948f918fe91658c"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:5PAHF7TLEYWWSYVPUV2ZLNVO24","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Kiwi-Edit: Versatile Video Editing via Instruction and Reference Guidance","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"Kiwi-Edit achieves state-of-the-art results in controllable video editing by combining instructions with reference images through a new data pipeline and architecture.","cross_cats":["cs.AI"],"primary_cat":"cs.CV","authors_text":"Guoqiang Liang, Mike Zheng Shou, Yanzhe Chen, Yiqi Lin, Zechen Bai, Ziyun Zeng","submitted_at":"2026-03-02T18:46:28Z","abstract_excerpt":"Instruction-based video editing has witnessed rapid progress, yet current methods often struggle with precise visual control, as natural language is inherently limited in describing complex visual nuances. Although reference-guided editing offers a robust solution, its potential is currently bottlenecked by the scarcity of high-quality paired training data. To bridge this gap, we introduce a scalable data generation pipeline that transforms existing video editing pairs into high-fidelity training quadruplets, leveraging image generative models to create synthesized reference scaffolds. Using t"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Our model achieves significant gains in instruction following and reference fidelity via a progressive multi-stage training curriculum. Extensive experiments demonstrate that our data and architecture establish a new state-of-the-art in controllable video editing.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The image generative models used in the data pipeline produce synthesized reference scaffolds that are high-fidelity and unbiased enough to train a model that generalizes to real user-provided references without introducing artifacts or distribution shifts.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Kiwi-Edit introduces a scalable pipeline to generate RefVIE dataset and a unified model using learnable queries plus reference features to achieve new state-of-the-art in instruction-and-reference guided video editing.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Kiwi-Edit achieves state-of-the-art results in controllable video editing by combining instructions with reference images through a new data pipeline and architecture.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"e18ce611c993c105eeb645cb594d0d4ba9bc8bdc9675158d47d432bbde5b9b0d"},"source":{"id":"2603.02175","kind":"arxiv","version":4},"verdict":{"id":"21c21a12-9247-445a-99e0-513e83cb9b5a","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T17:32:43.981091Z","strongest_claim":"Our model achieves significant gains in instruction following and reference fidelity via a progressive multi-stage training curriculum. Extensive experiments demonstrate that our data and architecture establish a new state-of-the-art in controllable video editing.","one_line_summary":"Kiwi-Edit introduces a scalable pipeline to generate RefVIE dataset and a unified model using learnable queries plus reference features to achieve new state-of-the-art in instruction-and-reference guided video editing.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The image generative models used in the data pipeline produce synthesized reference scaffolds that are high-fidelity and unbiased enough to train a model that generalizes to real user-provided references without introducing artifacts or distribution shifts.","pith_extraction_headline":"Kiwi-Edit achieves state-of-the-art results in controllable video editing by combining instructions with reference images through a new data pipeline and architecture."},"references":{"count":32,"sample":[{"doi":"","year":null,"title":"- Object identity, attributes (color, shape, material, style), and edit type must be consistent","work_id":"d5d96ab4-c93f-4e63-ae26-2ad1f7cbb1a8","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2025,"title":"- Coherent structure, plausible lighting and texture","work_id":"b6eb58bc-17f3-42e1-a954-5cb7b805697a","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Object not swapped/added, or a completely unrelated object appears","work_id":"1e0d2148-6d9f-4a08-b6ec-661e90d27fa9","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Object is changed, but looks nothing like the reference image (wrong color, shape, or class)","work_id":"3db0ad97-159f-4a41-870d-ed838e14bd14","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Object class is correct, but identity details (texture, specific markings, logos) differ significantly from the reference image","work_id":"0da8ff03-155b-4fe6-a9de-889d62aece5d","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":32,"snapshot_sha256":"4bb9d83768bf47e9c83c3bc6bd7f64d35cbd3aa612d627a0e0fb0bda30271b1e","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"21c21a12-9247-445a-99e0-513e83cb9b5a"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T03:09:23Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"RU1Bre2nnULqXsMDs65Ywsfu496rK7kb0o/+8bCMDmF5XVYfa37Xb/6qk9gJA5eCYZqEJdrY4PkDm/cPppxdAg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-27T14:10:18.247524Z"},"content_sha256":"9abc1cd8e68e1e0ae72c5fe831e426950616a74e2cc542fa5016f433730ba652","schema_version":"1.0","event_id":"sha256:9abc1cd8e68e1e0ae72c5fe831e426950616a74e2cc542fa5016f433730ba652"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/5PAHF7TLEYWWSYVPUV2ZLNVO24/bundle.json","state_url":"https://pith.science/pith/5PAHF7TLEYWWSYVPUV2ZLNVO24/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/5PAHF7TLEYWWSYVPUV2ZLNVO24/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-27T14:10:18Z","links":{"resolver":"https://pith.science/pith/5PAHF7TLEYWWSYVPUV2ZLNVO24","bundle":"https://pith.science/pith/5PAHF7TLEYWWSYVPUV2ZLNVO24/bundle.json","state":"https://pith.science/pith/5PAHF7TLEYWWSYVPUV2ZLNVO24/state.json","well_known_bundle":"https://pith.science/.well-known/pith/5PAHF7TLEYWWSYVPUV2ZLNVO24/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:5PAHF7TLEYWWSYVPUV2ZLNVO24","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"1d4d9c9f52287d7737edd4900dfb6e570d716a7a4a526f3592fcdcb24500696d","cross_cats_sorted":["cs.AI"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-03-02T18:46:28Z","title_canon_sha256":"e82812b8063cd874a10adc3deb3298719b67b3b54e2993ac6ab820884bcb1180"},"schema_version":"1.0","source":{"id":"2603.02175","kind":"arxiv","version":4}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2603.02175","created_at":"2026-05-18T03:09:23Z"},{"alias_kind":"arxiv_version","alias_value":"2603.02175v4","created_at":"2026-05-18T03:09:23Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2603.02175","created_at":"2026-05-18T03:09:23Z"},{"alias_kind":"pith_short_12","alias_value":"5PAHF7TLEYWW","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"5PAHF7TLEYWWSYVP","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"5PAHF7TL","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:9abc1cd8e68e1e0ae72c5fe831e426950616a74e2cc542fa5016f433730ba652","target":"graph","created_at":"2026-05-18T03:09:23Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Our model achieves significant gains in instruction following and reference fidelity via a progressive multi-stage training curriculum. Extensive experiments demonstrate that our data and architecture establish a new state-of-the-art in controllable video editing."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"The image generative models used in the data pipeline produce synthesized reference scaffolds that are high-fidelity and unbiased enough to train a model that generalizes to real user-provided references without introducing artifacts or distribution shifts."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Kiwi-Edit introduces a scalable pipeline to generate RefVIE dataset and a unified model using learnable queries plus reference features to achieve new state-of-the-art in instruction-and-reference guided video editing."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Kiwi-Edit achieves state-of-the-art results in controllable video editing by combining instructions with reference images through a new data pipeline and architecture."}],"snapshot_sha256":"e18ce611c993c105eeb645cb594d0d4ba9bc8bdc9675158d47d432bbde5b9b0d"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"Instruction-based video editing has witnessed rapid progress, yet current methods often struggle with precise visual control, as natural language is inherently limited in describing complex visual nuances. Although reference-guided editing offers a robust solution, its potential is currently bottlenecked by the scarcity of high-quality paired training data. To bridge this gap, we introduce a scalable data generation pipeline that transforms existing video editing pairs into high-fidelity training quadruplets, leveraging image generative models to create synthesized reference scaffolds. Using t","authors_text":"Guoqiang Liang, Mike Zheng Shou, Yanzhe Chen, Yiqi Lin, Zechen Bai, Ziyun Zeng","cross_cats":["cs.AI"],"headline":"Kiwi-Edit achieves state-of-the-art results in controllable video editing by combining instructions with reference images through a new data pipeline and architecture.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-03-02T18:46:28Z","title":"Kiwi-Edit: Versatile Video Editing via Instruction and Reference Guidance"},"references":{"count":32,"internal_anchors":0,"resolved_work":32,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"- Object identity, attributes (color, shape, material, style), and edit type must be consistent","work_id":"d5d96ab4-c93f-4e63-ae26-2ad1f7cbb1a8","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"- Coherent structure, plausible lighting and texture","work_id":"b6eb58bc-17f3-42e1-a954-5cb7b805697a","year":2025},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Object not swapped/added, or a completely unrelated object appears","work_id":"1e0d2148-6d9f-4a08-b6ec-661e90d27fa9","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Object is changed, but looks nothing like the reference image (wrong color, shape, or class)","work_id":"3db0ad97-159f-4a41-870d-ed838e14bd14","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Object class is correct, but identity details (texture, specific markings, logos) differ significantly from the reference image","work_id":"0da8ff03-155b-4fe6-a9de-889d62aece5d","year":null}],"snapshot_sha256":"4bb9d83768bf47e9c83c3bc6bd7f64d35cbd3aa612d627a0e0fb0bda30271b1e"},"source":{"id":"2603.02175","kind":"arxiv","version":4},"verdict":{"created_at":"2026-05-15T17:32:43.981091Z","id":"21c21a12-9247-445a-99e0-513e83cb9b5a","model_set":{"reader":"grok-4.3"},"one_line_summary":"Kiwi-Edit introduces a scalable pipeline to generate RefVIE dataset and a unified model using learnable queries plus reference features to achieve new state-of-the-art in instruction-and-reference guided video editing.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Kiwi-Edit achieves state-of-the-art results in controllable video editing by combining instructions with reference images through a new data pipeline and architecture.","strongest_claim":"Our model achieves significant gains in instruction following and reference fidelity via a progressive multi-stage training curriculum. Extensive experiments demonstrate that our data and architecture establish a new state-of-the-art in controllable video editing.","weakest_assumption":"The image generative models used in the data pipeline produce synthesized reference scaffolds that are high-fidelity and unbiased enough to train a model that generalizes to real user-provided references without introducing artifacts or distribution shifts."}},"verdict_id":"21c21a12-9247-445a-99e0-513e83cb9b5a"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:2933f12b60891a09587284bd2299849764d64b5b5588d23f3948f918fe91658c","target":"record","created_at":"2026-05-18T03:09:23Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"1d4d9c9f52287d7737edd4900dfb6e570d716a7a4a526f3592fcdcb24500696d","cross_cats_sorted":["cs.AI"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-03-02T18:46:28Z","title_canon_sha256":"e82812b8063cd874a10adc3deb3298719b67b3b54e2993ac6ab820884bcb1180"},"schema_version":"1.0","source":{"id":"2603.02175","kind":"arxiv","version":4}},"canonical_sha256":"ebc072fe6b262d6962afa57595b6aed732be15f207cb133df498536b061d0300","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"ebc072fe6b262d6962afa57595b6aed732be15f207cb133df498536b061d0300","first_computed_at":"2026-05-18T03:09:23.219439Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-18T03:09:23.219439Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"xz2r4KhtQNeBxFY4DuWN+shrryLYwgdVUQ8/ucs59h++d0G+brjbwTjlERllAYJ/aTjg4RoTpWDpFB/ZUbU0Bw==","signature_status":"signed_v1","signed_at":"2026-05-18T03:09:23.220217Z","signed_message":"canonical_sha256_bytes"},"source_id":"2603.02175","source_kind":"arxiv","source_version":4}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:2933f12b60891a09587284bd2299849764d64b5b5588d23f3948f918fe91658c","sha256:9abc1cd8e68e1e0ae72c5fe831e426950616a74e2cc542fa5016f433730ba652"],"state_sha256":"c1e1c1ca7b73b65855a75092ea3f27aa2d1c28af331de25aabcb124e12dad5ba"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"TeXZBgRfqddumg0SVKk2KNDwzufEkd1RWwZ3ngx0YxlXxIwhGKYd3AInl3JdHkBV9jNdHjRDA2PBLD1EUVMEAQ==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-27T14:10:18.251367Z","bundle_sha256":"6ae997295943e3fd08872faeb3402482d384e88b9d8e1d68f3422f49d00da5cc"}}