{"work":{"id":"02efb197-4e2b-4141-8587-21b92ad92f08","openalex_id":null,"doi":null,"arxiv_id":"2512.13507","raw_key":null,"title":"Seedance 1.5 pro: A Native Audio-Visual Joint Generation Foundation Model","authors":null,"authors_text":"Team Seedance, Heyi Chen, Siyan Chen, Xin Chen, Yanfei Chen, Ying Chen","year":2025,"venue":"cs.CV","abstract":"Recent strides in video generation have paved the way for unified audio-visual generation. In this work, we present Seedance 1.5 pro, a foundational model engineered specifically for native, joint audio-video generation. Leveraging a dual-branch Diffusion Transformer architecture, the model integrates a cross-modal joint module with a specialized multi-stage data pipeline, achieving exceptional audio-visual synchronization and superior generation quality. To ensure practical utility, we implement meticulous post-training optimizations, including Supervised Fine-Tuning (SFT) on high-quality datasets and Reinforcement Learning from Human Feedback (RLHF) with multi-dimensional reward models. Furthermore, we introduce an acceleration framework that boosts inference speed by over 10X. Seedance 1.5 pro distinguishes itself through precise multilingual and dialect lip-syncing, dynamic cinematic camera control, and enhanced narrative coherence, positioning it as a robust engine for professional-grade content creation. Seedance 1.5 pro is now accessible on Volcano Engine at https://console.volcengine.com/ark/region:ark+cn-beijing/experience/vision?type=GenVideo.","external_url":"https://arxiv.org/abs/2512.13507","cited_by_count":null,"metadata_source":"pith","metadata_fetched_at":"2026-05-22T07:14:42.800740+00:00","pith_arxiv_id":"2512.13507","created_at":"2026-05-09T05:55:29.237338+00:00","updated_at":"2026-05-22T07:14:42.800740+00:00","title_quality_ok":true,"display_title":"Seedance 1.5 pro: A Native Audio-Visual Joint Generation Foundation Model","render_title":"Seedance 1.5 pro: A Native Audio-Visual Joint Generation Foundation Model"},"hub":{"state":{"work_id":"02efb197-4e2b-4141-8587-21b92ad92f08","tier":"hub","tier_reason":"10+ Pith inbound or 1,000+ external citations","pith_inbound_count":21,"external_cited_by_count":null,"distinct_field_count":3,"first_pith_cited_at":"2026-01-28T12:37:01+00:00","last_pith_cited_at":"2026-05-21T14:48:35+00:00","author_build_status":"not_needed","summary_status":"needed","contexts_status":"needed","graph_status":"needed","ask_index_status":"not_needed","reader_status":"not_needed","recognition_status":"not_needed","updated_at":"2026-06-05T05:48:35.872757+00:00","tier_text":"hub"},"tier":"hub","role_counts":[{"context_role":"background","n":14}],"polarity_counts":[{"context_polarity":"background","n":14}],"runs":{},"summary":{},"graph":{},"authors":[]}}