{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:YMRSHNPDQOMVMXKBUBPXU3YXKM","short_pith_number":"pith:YMRSHNPD","schema_version":"1.0","canonical_sha256":"c32323b5e38399565d41a05f7a6f175304941fcbc88932d58cb1766d63ed4802","source":{"kind":"arxiv","id":"2502.07531","version":5},"attestation_state":"computed","paper":{"title":"VidCRAFT3: Camera, Object, and Lighting Control for Image-to-Video Generation","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","cs.LG","cs.MM"],"primary_cat":"cs.CV","authors_text":"Hang Xu, Sixiao Zheng, Xiangru Huang, Yanpeng Zhou, Yanwei Fu, Yi Zhu, Zimian Peng","submitted_at":"2025-02-11T13:11:59Z","abstract_excerpt":"Controllable image-to-video (I2V) generation transforms a reference image into a coherent video guided by user-specified control signals. While precise control over camera motion, object motion, and lighting is essential for high-fidelity creation, existing methods often treat these factors independently. This overlooks the physical coupling among viewpoint, geometry, and illumination in dynamic scenes, leading to visual inconsistencies such as mismatched shadows and perspective drift under simultaneous changes. We present VidCRAFT3, a unified and flexible I2V framework that explicitly models "},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2502.07531","kind":"arxiv","version":5},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2025-02-11T13:11:59Z","cross_cats_sorted":["cs.AI","cs.LG","cs.MM"],"title_canon_sha256":"34f1bb5f283f800a24c3b26b8282f69eb907814c540fdfe1e34f4ad800551b6b","abstract_canon_sha256":"3beb228370084ba8507be4c4919775173d3e548405bf186b7f969ce965425c71"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-19T16:11:09.974543Z","signature_b64":"ErVFPn0HGaat4jMcnhws4h0gzoihipB2DDiEBMv6mnWRHAACpbNeaaVbgbluFJccpPGfzoaB6GNqPUKOBr2GBQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"c32323b5e38399565d41a05f7a6f175304941fcbc88932d58cb1766d63ed4802","last_reissued_at":"2026-06-19T16:11:09.974086Z","signature_status":"signed_v1","first_computed_at":"2026-06-19T16:11:09.974086Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"VidCRAFT3: Camera, Object, and Lighting Control for Image-to-Video Generation","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","cs.LG","cs.MM"],"primary_cat":"cs.CV","authors_text":"Hang Xu, Sixiao Zheng, Xiangru Huang, Yanpeng Zhou, Yanwei Fu, Yi Zhu, Zimian Peng","submitted_at":"2025-02-11T13:11:59Z","abstract_excerpt":"Controllable image-to-video (I2V) generation transforms a reference image into a coherent video guided by user-specified control signals. While precise control over camera motion, object motion, and lighting is essential for high-fidelity creation, existing methods often treat these factors independently. This overlooks the physical coupling among viewpoint, geometry, and illumination in dynamic scenes, leading to visual inconsistencies such as mismatched shadows and perspective drift under simultaneous changes. We present VidCRAFT3, a unified and flexible I2V framework that explicitly models "},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2502.07531","kind":"arxiv","version":5},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2502.07531/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2502.07531","created_at":"2026-06-19T16:11:09.974142+00:00"},{"alias_kind":"arxiv_version","alias_value":"2502.07531v5","created_at":"2026-06-19T16:11:09.974142+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2502.07531","created_at":"2026-06-19T16:11:09.974142+00:00"},{"alias_kind":"pith_short_12","alias_value":"YMRSHNPDQOMV","created_at":"2026-06-19T16:11:09.974142+00:00"},{"alias_kind":"pith_short_16","alias_value":"YMRSHNPDQOMVMXKB","created_at":"2026-06-19T16:11:09.974142+00:00"},{"alias_kind":"pith_short_8","alias_value":"YMRSHNPD","created_at":"2026-06-19T16:11:09.974142+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":6,"internal_anchor_count":6,"sample":[{"citing_arxiv_id":"2604.24764","citing_title":"World-R1: Reinforcing 3D Constraints for Text-to-Video Generation","ref_index":31,"is_internal_anchor":true},{"citing_arxiv_id":"2605.22818","citing_title":"MotiMotion: Motion-Controlled Video Generation with Visual Reasoning","ref_index":12,"is_internal_anchor":true},{"citing_arxiv_id":"2604.24764","citing_title":"World-R1: Reinforcing 3D Constraints for Text-to-Video Generation","ref_index":31,"is_internal_anchor":true},{"citing_arxiv_id":"2604.17565","citing_title":"UniGeo: Unifying Geometric Guidance for Camera-Controllable Image Editing via Video Models","ref_index":93,"is_internal_anchor":true},{"citing_arxiv_id":"2604.24764","citing_title":"World-R1: Reinforcing 3D Constraints for Text-to-Video Generation","ref_index":31,"is_internal_anchor":true},{"citing_arxiv_id":"2604.17565","citing_title":"UniGeo: Unifying Geometric Guidance for Camera-Controllable Image Editing via Video Models","ref_index":93,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/YMRSHNPDQOMVMXKBUBPXU3YXKM","json":"https://pith.science/pith/YMRSHNPDQOMVMXKBUBPXU3YXKM.json","graph_json":"https://pith.science/api/pith-number/YMRSHNPDQOMVMXKBUBPXU3YXKM/graph.json","events_json":"https://pith.science/api/pith-number/YMRSHNPDQOMVMXKBUBPXU3YXKM/events.json","paper":"https://pith.science/paper/YMRSHNPD"},"agent_actions":{"view_html":"https://pith.science/pith/YMRSHNPDQOMVMXKBUBPXU3YXKM","download_json":"https://pith.science/pith/YMRSHNPDQOMVMXKBUBPXU3YXKM.json","view_paper":"https://pith.science/paper/YMRSHNPD","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2502.07531&json=true","fetch_graph":"https://pith.science/api/pith-number/YMRSHNPDQOMVMXKBUBPXU3YXKM/graph.json","fetch_events":"https://pith.science/api/pith-number/YMRSHNPDQOMVMXKBUBPXU3YXKM/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/YMRSHNPDQOMVMXKBUBPXU3YXKM/action/timestamp_anchor","attest_storage":"https://pith.science/pith/YMRSHNPDQOMVMXKBUBPXU3YXKM/action/storage_attestation","attest_author":"https://pith.science/pith/YMRSHNPDQOMVMXKBUBPXU3YXKM/action/author_attestation","sign_citation":"https://pith.science/pith/YMRSHNPDQOMVMXKBUBPXU3YXKM/action/citation_signature","submit_replication":"https://pith.science/pith/YMRSHNPDQOMVMXKBUBPXU3YXKM/action/replication_record"}},"created_at":"2026-06-19T16:11:09.974142+00:00","updated_at":"2026-06-19T16:11:09.974142+00:00"}