{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2023:UDEIT65RSITYNSEOOGE6RC2CSO","short_pith_number":"pith:UDEIT65R","schema_version":"1.0","canonical_sha256":"a0c889fbb1922786c88e7189e88b42939b92c00d8e249bb4792bd5415bbf34ed","source":{"kind":"arxiv","id":"2308.08089","version":1},"attestation_state":"computed","paper":{"title":"DragNUWA: Fine-grained Control in Video Generation by Integrating Text, Image, and Trajectory","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Chenfei Wu, Gong Ming, Houqiang Li, Jian Liang, Jie Shi, Nan Duan, Shengming Yin","submitted_at":"2023-08-16T01:43:41Z","abstract_excerpt":"Controllable video generation has gained significant attention in recent years. However, two main limitations persist: Firstly, most existing works focus on either text, image, or trajectory-based control, leading to an inability to achieve fine-grained control in videos. Secondly, trajectory control research is still in its early stages, with most experiments being conducted on simple datasets like Human3.6M. This constraint limits the models' capability to process open-domain images and effectively handle complex curved trajectories. In this paper, we propose DragNUWA, an open-domain diffusi"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2308.08089","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2023-08-16T01:43:41Z","cross_cats_sorted":[],"title_canon_sha256":"fdcf8dc07f3c93b5e998c1ac44f384844deca24555f18ec762fd1eb5c2e25878","abstract_canon_sha256":"351064fbd672018a550433e0ada1792c53cbca346b8a1b22b6957f1d1fc00e99"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T12:56:56.092425Z","signature_b64":"+T8orEuGVy7A2gU35RETV0Kv6dZbG7xd7ewzwDQLI4rANO3esmrRThzq2sBISkR5TsY9p0b8JHNn1pYN/GS6Bw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"a0c889fbb1922786c88e7189e88b42939b92c00d8e249bb4792bd5415bbf34ed","last_reissued_at":"2026-05-20T12:56:56.090709Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T12:56:56.090709Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"DragNUWA: Fine-grained Control in Video Generation by Integrating Text, Image, and Trajectory","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Chenfei Wu, Gong Ming, Houqiang Li, Jian Liang, Jie Shi, Nan Duan, Shengming Yin","submitted_at":"2023-08-16T01:43:41Z","abstract_excerpt":"Controllable video generation has gained significant attention in recent years. However, two main limitations persist: Firstly, most existing works focus on either text, image, or trajectory-based control, leading to an inability to achieve fine-grained control in videos. Secondly, trajectory control research is still in its early stages, with most experiments being conducted on simple datasets like Human3.6M. This constraint limits the models' capability to process open-domain images and effectively handle complex curved trajectories. In this paper, we propose DragNUWA, an open-domain diffusi"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2308.08089","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2308.08089/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2308.08089","created_at":"2026-05-20T12:56:56.090787+00:00"},{"alias_kind":"arxiv_version","alias_value":"2308.08089v1","created_at":"2026-05-20T12:56:56.090787+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2308.08089","created_at":"2026-05-20T12:56:56.090787+00:00"},{"alias_kind":"pith_short_12","alias_value":"UDEIT65RSITY","created_at":"2026-05-20T12:56:56.090787+00:00"},{"alias_kind":"pith_short_16","alias_value":"UDEIT65RSITYNSEO","created_at":"2026-05-20T12:56:56.090787+00:00"},{"alias_kind":"pith_short_8","alias_value":"UDEIT65R","created_at":"2026-05-20T12:56:56.090787+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":23,"internal_anchor_count":23,"sample":[{"citing_arxiv_id":"2605.18010","citing_title":"Functionalization via Structure Completion and Motion Rectification","ref_index":269,"is_internal_anchor":true},{"citing_arxiv_id":"2605.19728","citing_title":"Aero-World: Action-Conditioned Aerial Video Generation from Inertial Controls","ref_index":24,"is_internal_anchor":true},{"citing_arxiv_id":"2605.15256","citing_title":"ReactiveGWM: Steering NPC in Reactive Game World Models","ref_index":45,"is_internal_anchor":true},{"citing_arxiv_id":"2510.01186","citing_title":"ASTRA: Let Arbitrary Subjects Transform in Video Editing","ref_index":27,"is_internal_anchor":true},{"citing_arxiv_id":"2511.00503","citing_title":"Diff4Splat: Controllable 4D Scene Generation with Latent Dynamic Reconstruction Models","ref_index":99,"is_internal_anchor":true},{"citing_arxiv_id":"2311.04145","citing_title":"I2VGen-XL: High-Quality Image-to-Video Synthesis via Cascaded Diffusion Models","ref_index":53,"is_internal_anchor":true},{"citing_arxiv_id":"2605.13838","citing_title":"R-DMesh: Video-Guided 3D Animation via Rectified Dynamic Mesh Flow","ref_index":159,"is_internal_anchor":true},{"citing_arxiv_id":"2310.19512","citing_title":"VideoCrafter1: Open Diffusion Models for High-Quality Video Generation","ref_index":56,"is_internal_anchor":true},{"citing_arxiv_id":"2605.13838","citing_title":"R-DMesh: Video-Guided 3D Animation via Rectified Dynamic Mesh Flow","ref_index":159,"is_internal_anchor":true},{"citing_arxiv_id":"2604.03305","citing_title":"HVG-3D: Bridging Real and Simulation Domains for 3D-Conditional Hand-Object Interaction Video Synthesis","ref_index":79,"is_internal_anchor":true},{"citing_arxiv_id":"2409.02048","citing_title":"ViewCrafter: Taming Video Diffusion Models for High-fidelity Novel View Synthesis","ref_index":50,"is_internal_anchor":true},{"citing_arxiv_id":"2605.06280","citing_title":"Eulerian Motion Guidance: Robust Image Animation via Bidirectional Geometric Consistency","ref_index":36,"is_internal_anchor":true},{"citing_arxiv_id":"2404.02101","citing_title":"CameraCtrl: Enabling Camera Control for Text-to-Video Generation","ref_index":163,"is_internal_anchor":true},{"citing_arxiv_id":"2605.03637","citing_title":"Bridging the Embodiment Gap: Disentangled Cross-Embodiment Video Editing","ref_index":28,"is_internal_anchor":true},{"citing_arxiv_id":"2604.28169","citing_title":"PhyCo: Learning Controllable Physical Priors for Generative Motion","ref_index":48,"is_internal_anchor":true},{"citing_arxiv_id":"2605.06280","citing_title":"Eulerian Motion Guidance: Robust Image Animation via Bidirectional Geometric Consistency","ref_index":35,"is_internal_anchor":true},{"citing_arxiv_id":"2604.09057","citing_title":"Tora3: Trajectory-Guided Audio-Video Generation with Physical Coherence","ref_index":56,"is_internal_anchor":true},{"citing_arxiv_id":"2604.07348","citing_title":"MoRight: Motion Control Done Right","ref_index":86,"is_internal_anchor":true},{"citing_arxiv_id":"2605.06280","citing_title":"Eulerian Motion Guidance: Robust Image Animation via Bidirectional Geometric Consistency","ref_index":36,"is_internal_anchor":true},{"citing_arxiv_id":"2604.07758","citing_title":"DailyArt: Discovering Articulation from Single Static Images via Latent Dynamics","ref_index":55,"is_internal_anchor":true},{"citing_arxiv_id":"2604.06339","citing_title":"Evolution of Video Generative Foundations","ref_index":222,"is_internal_anchor":true},{"citing_arxiv_id":"2307.04725","citing_title":"AnimateDiff: Animate Your Personalized Text-to-Image Diffusion Models without Specific Tuning","ref_index":23,"is_internal_anchor":true},{"citing_arxiv_id":"2604.21776","citing_title":"Reshoot-Anything: A Self-Supervised Model for In-the-Wild Video Reshooting","ref_index":47,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/UDEIT65RSITYNSEOOGE6RC2CSO","json":"https://pith.science/pith/UDEIT65RSITYNSEOOGE6RC2CSO.json","graph_json":"https://pith.science/api/pith-number/UDEIT65RSITYNSEOOGE6RC2CSO/graph.json","events_json":"https://pith.science/api/pith-number/UDEIT65RSITYNSEOOGE6RC2CSO/events.json","paper":"https://pith.science/paper/UDEIT65R"},"agent_actions":{"view_html":"https://pith.science/pith/UDEIT65RSITYNSEOOGE6RC2CSO","download_json":"https://pith.science/pith/UDEIT65RSITYNSEOOGE6RC2CSO.json","view_paper":"https://pith.science/paper/UDEIT65R","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2308.08089&json=true","fetch_graph":"https://pith.science/api/pith-number/UDEIT65RSITYNSEOOGE6RC2CSO/graph.json","fetch_events":"https://pith.science/api/pith-number/UDEIT65RSITYNSEOOGE6RC2CSO/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/UDEIT65RSITYNSEOOGE6RC2CSO/action/timestamp_anchor","attest_storage":"https://pith.science/pith/UDEIT65RSITYNSEOOGE6RC2CSO/action/storage_attestation","attest_author":"https://pith.science/pith/UDEIT65RSITYNSEOOGE6RC2CSO/action/author_attestation","sign_citation":"https://pith.science/pith/UDEIT65RSITYNSEOOGE6RC2CSO/action/citation_signature","submit_replication":"https://pith.science/pith/UDEIT65RSITYNSEOOGE6RC2CSO/action/replication_record"}},"created_at":"2026-05-20T12:56:56.090787+00:00","updated_at":"2026-05-20T12:56:56.090787+00:00"}