{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:3WCZOPNLJESLCIYOOTKNPG4SY5","short_pith_number":"pith:3WCZOPNL","canonical_record":{"source":{"id":"2605.15196","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-14T17:59:52Z","cross_cats_sorted":["cs.LG"],"title_canon_sha256":"877ed8b8f595c87067d2944d4d25e4feb3db485a9683b647abc3b1396daad233","abstract_canon_sha256":"df03884989f4ed1d6106661a2fcf979c3db31a6fc5086a6be40cf1ae8869b776"},"schema_version":"1.0"},"canonical_sha256":"dd85973dab4924b1230e74d4d79b92c766ae3cc8f3c755bb50c38597b8272a8d","source":{"kind":"arxiv","id":"2605.15196","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.15196","created_at":"2026-05-17T21:18:32Z"},{"alias_kind":"arxiv_version","alias_value":"2605.15196v1","created_at":"2026-05-17T21:18:32Z"},{"alias_kind":"pith_short_12","alias_value":"3WCZOPNLJESL","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"3WCZOPNLJESLCIYO","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"3WCZOPNL","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:3WCZOPNLJESLCIYOOTKNPG4SY5","target":"record","payload":{"canonical_record":{"source":{"id":"2605.15196","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-14T17:59:52Z","cross_cats_sorted":["cs.LG"],"title_canon_sha256":"877ed8b8f595c87067d2944d4d25e4feb3db485a9683b647abc3b1396daad233","abstract_canon_sha256":"df03884989f4ed1d6106661a2fcf979c3db31a6fc5086a6be40cf1ae8869b776"},"schema_version":"1.0"},"canonical_sha256":"dd85973dab4924b1230e74d4d79b92c766ae3cc8f3c755bb50c38597b8272a8d","receipt":{"kind":"pith_receipt","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.2","canonical_sha256":"dd85973dab4924b1230e74d4d79b92c766ae3cc8f3c755bb50c38597b8272a8d","last_reissued_at":"2026-05-17T21:57:18.416489Z","signature_status":"unsigned_v0","first_computed_at":"2026-05-17T21:40:25.018212Z"},"source_kind":"arxiv","source_id":"2605.15196","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T21:18:32Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"tFLgl7L8Yz1D6mb8KXU2tBmBcaAQEseJkKMzD68XpTe+T+84JeIX+jJ9VpaFV6kX/tTVwGuAunwzh7aWEC1SAw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-26T05:14:34.723988Z"},"content_sha256":"9641f93ca0f61004e0bbbe3d0ec21e16e489d632f4cf6d74568d9216303f8391","schema_version":"1.0","event_id":"sha256:9641f93ca0f61004e0bbbe3d0ec21e16e489d632f4cf6d74568d9216303f8391"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:3WCZOPNLJESLCIYOOTKNPG4SY5","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"RefDecoder: Enhancing Visual Generation with Conditional Video Decoding","license":"http://creativecommons.org/licenses/by-sa/4.0/","headline":"RefDecoder adds reference-image conditioning to video VAE decoders through attention, yielding up to 2.1 dB PSNR gains and better consistency on I2V, editing, and style-transfer tasks.","cross_cats":["cs.LG"],"primary_cat":"cs.CV","authors_text":"Bohan Fang, Ranjay Krishna, Xiang Fan, Yuheng Wang, Zhongzheng Ren","submitted_at":"2026-05-14T17:59:52Z","abstract_excerpt":"Video generation powers a vast array of downstream applications. However, while the de facto standard, i.e., latent diffusion models, typically employ heavily conditioned denoising networks, their decoders often remain unconditional. We observe that this architectural asymmetry leads to significant loss of detail and inconsistency relative to the input image. To address this, we argue that the decoder requires equal conditioning to preserve structural integrity. We introduce RefDecoder, a reference-conditioned video VAE decoder by injecting high-fidelity reference image signal directly into th"},"claims":{"count":3,"items":[{"kind":"strongest_claim","text":"We introduce RefDecoder, a reference-conditioned video VAE decoder by injecting high-fidelity reference image signal directly into the decoding process via reference attention... achieving up to +2.1dB PSNR over the unconditional baselines on the Inter4K, WebVid, and Large Motion reconstruction benchmarks.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That equal conditioning of the decoder via reference attention is sufficient to preserve structural integrity without introducing new artifacts or requiring any fine-tuning of the rest of the pipeline.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"RefDecoder adds reference-image conditioning to video VAE decoders through attention, yielding up to 2.1 dB PSNR gains and better consistency on I2V, editing, and style-transfer tasks.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"}],"snapshot_sha256":"1e6d0f547894b1293e112af18406e9e0dfd045446e49890a615bddb829953c42"},"source":{"id":"2605.15196","kind":"arxiv","version":1},"verdict":{"id":"4c7596fd-74e6-49d4-9265-0787753f1d19","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T03:17:21.949834Z","strongest_claim":"We introduce RefDecoder, a reference-conditioned video VAE decoder by injecting high-fidelity reference image signal directly into the decoding process via reference attention... achieving up to +2.1dB PSNR over the unconditional baselines on the Inter4K, WebVid, and Large Motion reconstruction benchmarks.","one_line_summary":"RefDecoder adds reference-image conditioning to video VAE decoders through attention, yielding up to 2.1 dB PSNR gains and better consistency on I2V, editing, and style-transfer tasks.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That equal conditioning of the decoder via reference attention is sufficient to preserve structural integrity without introducing new artifacts or requiring any fine-tuning of the rest of the pipeline.","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":1,"snapshot_sha256":"6380e121a5303fa3071acc26193bf3050185e00176bb4bad61cce154b93d7be0"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"4c7596fd-74e6-49d4-9265-0787753f1d19"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T21:57:18Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"kmEynCDfKL8CdXZ8KxOO5Wp1tTAnGgnVDIe+m3pMrunXwRx2oVNGdT4bTBPh6S/zwO0MhfHTvoaBrOiOH7DCBg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-26T05:14:34.724809Z"},"content_sha256":"11fa895a320308b66f7a8e7975956c75479924c34fef96121827308033b376d9","schema_version":"1.0","event_id":"sha256:11fa895a320308b66f7a8e7975956c75479924c34fef96121827308033b376d9"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/3WCZOPNLJESLCIYOOTKNPG4SY5/bundle.json","state_url":"https://pith.science/pith/3WCZOPNLJESLCIYOOTKNPG4SY5/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/3WCZOPNLJESLCIYOOTKNPG4SY5/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-26T05:14:34Z","links":{"resolver":"https://pith.science/pith/3WCZOPNLJESLCIYOOTKNPG4SY5","bundle":"https://pith.science/pith/3WCZOPNLJESLCIYOOTKNPG4SY5/bundle.json","state":"https://pith.science/pith/3WCZOPNLJESLCIYOOTKNPG4SY5/state.json","well_known_bundle":"https://pith.science/.well-known/pith/3WCZOPNLJESLCIYOOTKNPG4SY5/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:3WCZOPNLJESLCIYOOTKNPG4SY5","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"df03884989f4ed1d6106661a2fcf979c3db31a6fc5086a6be40cf1ae8869b776","cross_cats_sorted":["cs.LG"],"license":"http://creativecommons.org/licenses/by-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-14T17:59:52Z","title_canon_sha256":"877ed8b8f595c87067d2944d4d25e4feb3db485a9683b647abc3b1396daad233"},"schema_version":"1.0","source":{"id":"2605.15196","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.15196","created_at":"2026-05-17T21:18:32Z"},{"alias_kind":"arxiv_version","alias_value":"2605.15196v1","created_at":"2026-05-17T21:18:32Z"},{"alias_kind":"pith_short_12","alias_value":"3WCZOPNLJESL","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"3WCZOPNLJESLCIYO","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"3WCZOPNL","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:11fa895a320308b66f7a8e7975956c75479924c34fef96121827308033b376d9","target":"graph","created_at":"2026-05-17T21:57:18Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":3,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"We introduce RefDecoder, a reference-conditioned video VAE decoder by injecting high-fidelity reference image signal directly into the decoding process via reference attention... achieving up to +2.1dB PSNR over the unconditional baselines on the Inter4K, WebVid, and Large Motion reconstruction benchmarks."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That equal conditioning of the decoder via reference attention is sufficient to preserve structural integrity without introducing new artifacts or requiring any fine-tuning of the rest of the pipeline."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"RefDecoder adds reference-image conditioning to video VAE decoders through attention, yielding up to 2.1 dB PSNR gains and better consistency on I2V, editing, and style-transfer tasks."}],"snapshot_sha256":"1e6d0f547894b1293e112af18406e9e0dfd045446e49890a615bddb829953c42"},"formal_canon":{"evidence_count":1,"snapshot_sha256":"6380e121a5303fa3071acc26193bf3050185e00176bb4bad61cce154b93d7be0"},"paper":{"abstract_excerpt":"Video generation powers a vast array of downstream applications. However, while the de facto standard, i.e., latent diffusion models, typically employ heavily conditioned denoising networks, their decoders often remain unconditional. We observe that this architectural asymmetry leads to significant loss of detail and inconsistency relative to the input image. To address this, we argue that the decoder requires equal conditioning to preserve structural integrity. We introduce RefDecoder, a reference-conditioned video VAE decoder by injecting high-fidelity reference image signal directly into th","authors_text":"Bohan Fang, Ranjay Krishna, Xiang Fan, Yuheng Wang, Zhongzheng Ren","cross_cats":["cs.LG"],"headline":"RefDecoder adds reference-image conditioning to video VAE decoders through attention, yielding up to 2.1 dB PSNR gains and better consistency on I2V, editing, and style-transfer tasks.","license":"http://creativecommons.org/licenses/by-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-14T17:59:52Z","title":"RefDecoder: Enhancing Visual Generation with Conditional Video Decoding"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.15196","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-15T03:17:21.949834Z","id":"4c7596fd-74e6-49d4-9265-0787753f1d19","model_set":{"reader":"grok-4.3"},"one_line_summary":"RefDecoder adds reference-image conditioning to video VAE decoders through attention, yielding up to 2.1 dB PSNR gains and better consistency on I2V, editing, and style-transfer tasks.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"","strongest_claim":"We introduce RefDecoder, a reference-conditioned video VAE decoder by injecting high-fidelity reference image signal directly into the decoding process via reference attention... achieving up to +2.1dB PSNR over the unconditional baselines on the Inter4K, WebVid, and Large Motion reconstruction benchmarks.","weakest_assumption":"That equal conditioning of the decoder via reference attention is sufficient to preserve structural integrity without introducing new artifacts or requiring any fine-tuning of the rest of the pipeline."}},"verdict_id":"4c7596fd-74e6-49d4-9265-0787753f1d19"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:9641f93ca0f61004e0bbbe3d0ec21e16e489d632f4cf6d74568d9216303f8391","target":"record","created_at":"2026-05-17T21:18:32Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"df03884989f4ed1d6106661a2fcf979c3db31a6fc5086a6be40cf1ae8869b776","cross_cats_sorted":["cs.LG"],"license":"http://creativecommons.org/licenses/by-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-14T17:59:52Z","title_canon_sha256":"877ed8b8f595c87067d2944d4d25e4feb3db485a9683b647abc3b1396daad233"},"schema_version":"1.0","source":{"id":"2605.15196","kind":"arxiv","version":1}},"canonical_sha256":"dd85973dab4924b1230e74d4d79b92c766ae3cc8f3c755bb50c38597b8272a8d","receipt":{"builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"dd85973dab4924b1230e74d4d79b92c766ae3cc8f3c755bb50c38597b8272a8d","first_computed_at":"2026-05-17T21:40:25.018212Z","kind":"pith_receipt","last_reissued_at":"2026-05-17T21:57:18.416489Z","receipt_version":"0.2","signature_status":"unsigned_v0"},"source_id":"2605.15196","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:9641f93ca0f61004e0bbbe3d0ec21e16e489d632f4cf6d74568d9216303f8391","sha256:11fa895a320308b66f7a8e7975956c75479924c34fef96121827308033b376d9"],"state_sha256":"dfe6cb8fa7535a446be7fe0fd3d909073fbe79462e47e616bcfc098463f5ed6c"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"hktqAWDfNGkd3SWXDnxTAIsNWa7NmynhpTQjdfTQbkFJO90MA8+HB5HKWTvxkckP9bUvvoUadXNULcROjmLVCA==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-26T05:14:34.728909Z","bundle_sha256":"9f5881221631b17ca11dd3efe95eef1bb957fa75f032848c7de67e9121ddae15"}}