{"work":{"id":"52d502bd-9d8e-4944-9bf2-cfd097cfdb4e","openalex_id":null,"doi":null,"arxiv_id":"2512.16776","raw_key":null,"title":"Kling-Omni Technical Report","authors":null,"authors_text":"Kling Team: Jialu Chen, Yuanzheng Ci, Xiangyu Du, Zipeng Feng, Kun Gai, Sainan Guo","year":2025,"venue":"cs.CV","abstract":"We present Kling-Omni, a generalist generative framework designed to synthesize high-fidelity videos directly from multimodal visual language inputs. Adopting an end-to-end perspective, Kling-Omni bridges the functional separation among diverse video generation, editing, and intelligent reasoning tasks, integrating them into a holistic system. Unlike disjointed pipeline approaches, Kling-Omni supports a diverse range of user inputs, including text instructions, reference images, and video contexts, processing them into a unified multimodal representation to deliver cinematic-quality and highly-intelligent video content creation. To support these capabilities, we constructed a comprehensive data system that serves as the foundation for multimodal video creation. The framework is further empowered by efficient large-scale pre-training strategies and infrastructure optimizations for inference. Comprehensive evaluations reveal that Kling-Omni demonstrates exceptional capabilities in in-context generation, reasoning-based editing, and multimodal instruction following. \nMoving beyond a content creation tool, we believe Kling-Omni is a pivotal advancement toward multimodal world simulators capable of perceiving, reasoning, generating and interacting with the dynamic and complex worlds.","external_url":"https://arxiv.org/abs/2512.16776","cited_by_count":null,"metadata_source":"pith","metadata_fetched_at":"2026-05-16T07:10:43.158779+00:00","pith_arxiv_id":"2512.16776","created_at":"2026-05-09T06:05:34.502931+00:00","updated_at":"2026-05-16T07:10:43.158779+00:00","title_quality_ok":false,"display_title":"Kling-Omni Technical Report","render_title":"Kling-Omni Technical Report"},"hub":{"state":{"work_id":"52d502bd-9d8e-4944-9bf2-cfd097cfdb4e","tier":"hub","tier_reason":"10+ Pith inbound or 1,000+ external citations","pith_inbound_count":19,"external_cited_by_count":null,"distinct_field_count":3,"first_pith_cited_at":"2026-02-05T14:04:51+00:00","last_pith_cited_at":"2026-05-14T10:19:19+00:00","author_build_status":"not_needed","summary_status":"needed","contexts_status":"needed","graph_status":"needed","ask_index_status":"not_needed","reader_status":"not_needed","recognition_status":"not_needed","updated_at":"2026-05-16T13:18:46.347140+00:00","tier_text":"hub"},"tier":"hub","role_counts":[{"context_role":"background","n":5}],"polarity_counts":[{"context_polarity":"background","n":5}],"runs":{},"summary":{},"graph":{},"authors":[]}}