{"work":{"id":"d690be8f-5d53-49b0-b1e7-79668eb8fcdb","openalex_id":null,"doi":null,"arxiv_id":"2602.02276","raw_key":null,"title":"Kimi K2.5: Visual Agentic Intelligence","authors":null,"authors_text":"Kimi Team: Tongtong Bai, Yifan Bai, Yiping Bao, S.H. Cai, Yuan Cao, Y. Charles","year":2026,"venue":"cs.CL","abstract":"We introduce Kimi K2.5, an open-source multimodal agentic model designed to advance general agentic intelligence. K2.5 emphasizes the joint optimization of text and vision so that two modalities enhance each other. This includes a series of techniques such as joint text-vision pre-training, zero-vision SFT, and joint text-vision reinforcement learning. Building on this multimodal foundation, K2.5 introduces Agent Swarm, a self-directed parallel agent orchestration framework that dynamically decomposes complex tasks into heterogeneous sub-problems and executes them concurrently. Extensive evaluations show that Kimi K2.5 achieves state-of-the-art results across various domains including coding, vision, reasoning, and agentic tasks. Agent Swarm also reduces latency by up to $4.5\\times$ over single-agent baselines. We release the post-trained Kimi K2.5 model checkpoint to facilitate future research and real-world applications of agentic intelligence.","external_url":"https://arxiv.org/abs/2602.02276","cited_by_count":null,"metadata_source":"pith","metadata_fetched_at":"2026-05-25T06:10:24.094704+00:00","pith_arxiv_id":"2602.02276","created_at":"2026-05-09T05:55:31.271647+00:00","updated_at":"2026-05-25T06:10:24.094704+00:00","title_quality_ok":true,"display_title":"Kimi K2.5: Visual Agentic Intelligence","render_title":"Kimi K2.5: Visual Agentic Intelligence"},"hub":{"state":{"work_id":"d690be8f-5d53-49b0-b1e7-79668eb8fcdb","tier":"super_hub","tier_reason":"100+ Pith inbound or 10,000+ external citations","pith_inbound_count":132,"external_cited_by_count":null,"distinct_field_count":14,"first_pith_cited_at":"2026-02-17T17:50:56+00:00","last_pith_cited_at":"2026-05-22T17:59:38+00:00","author_build_status":"needed","summary_status":"needed","contexts_status":"needed","graph_status":"needed","ask_index_status":"needed","reader_status":"not_needed","recognition_status":"not_needed","updated_at":"2026-05-30T11:01:03.801894+00:00","tier_text":"super_hub"},"tier":"super_hub","role_counts":[{"context_role":"background","n":41},{"context_role":"baseline","n":13},{"context_role":"method","n":3},{"context_role":"other","n":2}],"polarity_counts":[{"context_polarity":"background","n":40},{"context_polarity":"baseline","n":13},{"context_polarity":"unclear","n":3},{"context_polarity":"use_method","n":3}],"runs":{"ask_index":{"job_type":"ask_index","status":"succeeded","result":{"title":"Kimi K2.5: Visual Agentic Intelligence","claims":[{"claim_text":"We introduce Kimi K2.5, an open-source multimodal agentic model designed to advance general agentic intelligence. K2.5 emphasizes the joint optimization of text and vision so that two modalities enhance each other. This includes a series of techniques such as joint text-vision pre-training, zero-vision SFT, and joint text-vision reinforcement learning. Building on this multimodal foundation, K2.5 introduces Agent Swarm, a self-directed parallel agent orchestration framework that dynamically decomposes complex tasks into heterogeneous sub-problems and executes them concurrently. Extensive evalu","claim_type":"abstract","evidence_strength":"source_metadata"},{"claim_text":"spanning two paradigms: Proprietary, including Claude Haiku 4.5 [61], Claude Sonnet 4.6 [62], Gemini 3 Flash Preview [63], Gemini 3.1 Pro Preview [64], GPT-5.4 [65], GPT-5.4 Mini [66], and Grok 4.20 [67]; and Open-weights, including DeepSeek V3.2 [68], GLM 5.1 [69], Kimi K2.5 [70], Llama 3.3 70B Instruct [71], MiniMax M2.7 [72], and Qwen3 30B A3B [73], Qwen3.5 397B A17B [74]. Harnesses.We consider four mainstream harnesses, namely vanilla ReAct [4], Codex [75], Claude Code [76], and OpenClaw [7]","claim_type":"background","confidence":0.7,"evidence_strength":"citation_context"}],"why_cited":"Pith tracks Kimi K2.5: Visual Agentic Intelligence because it crossed a citation-hub threshold. Current citing contexts most often use it as background evidence (1 contexts).","role_counts":[{"n":1,"context_role":"background"}]},"error":null,"updated_at":"2026-05-14T22:26:28.973884+00:00"},"author_expand":{"job_type":"author_expand","status":"succeeded","result":{"authors_linked":[{"id":"ba4c2b89-8e94-4891-8770-f87924f94945","orcid":null,"display_name":"Kimi Team: Tongtong Bai"},{"id":"26ebb414-c56a-4e74-a3b6-6cbf1eaadebc","orcid":null,"display_name":"Yifan Bai"},{"id":"55eaa707-33dc-444a-9cde-f2e59599fb53","orcid":null,"display_name":"Yiping Bao"},{"id":"b730153a-83ef-4c74-99eb-a1cc72067ab7","orcid":null,"display_name":"S.H. Cai"},{"id":"562a40ed-89d1-4b05-9ab9-94452a987eb4","orcid":null,"display_name":"Yuan Cao"},{"id":"76b2fe34-7517-4245-a62c-a41c2c6bf630","orcid":null,"display_name":"Y. Charles"}]},"error":null,"updated_at":"2026-05-14T22:26:30.262903+00:00"},"context_extract":{"job_type":"context_extract","status":"succeeded","result":{"enqueued_papers":25},"error":null,"updated_at":"2026-05-14T05:46:47.885835+00:00"},"graph_features":{"job_type":"graph_features","status":"succeeded","result":{"co_cited":[{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","shared_citers":37},{"title":"DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models","work_id":"c5006563-f3ec-438a-9e35-b7b484f34828","shared_citers":28},{"title":"GLM-5: from Vibe Coding to Agentic Engineering","work_id":"ad29b1a2-bf77-46b3-9ead-fb62b1d2c6fe","shared_citers":26},{"title":"DeepSeek-V3.2: Pushing the Frontier of Open Large Language Models","work_id":"07c85cc5-4086-4abc-823b-6d0f4ff784d0","shared_citers":24},{"title":"OpenAI GPT-5 System Card","work_id":"ca87689a-0d29-4476-b504-b65dbbb08af4","shared_citers":22},{"title":"Qwen3-VL Technical Report","work_id":"1fe243aa-e3c0-4da6-b391-4cbcfc88d5c0","shared_citers":22},{"title":"DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning","work_id":"e6b75ad5-2877-4168-97c8-710407094d20","shared_citers":17},{"title":"DAPO: An Open-Source LLM Reinforcement Learning System at Scale","work_id":"64019d00-0b11-4bbd-b173-b46c8fad0157","shared_citers":14},{"title":"DeepSeek-V3 Technical Report","work_id":"57d2791d-2219-4c31-a077-afc04b12a75c","shared_citers":14},{"title":"Gemini 2.5: Pushing the Frontier with Advanced Reasoning, Multimodality, Long Context, and Next Generation Agentic Capabilities","work_id":"008df105-2fdd-45d8-857a-8e35868aecb6","shared_citers":14},{"title":"Proximal Policy Optimization Algorithms","work_id":"240c67fe-d14d-4520-91c1-38a4e272ca19","shared_citers":14},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":11},{"title":"gpt-oss-120b & gpt-oss-20b Model Card","work_id":"178c1f7e-4f19-4392-a45d-45a6dfa88ead","shared_citers":10},{"title":"Group Sequence Policy Optimization","work_id":"3a98b53b-9f52-4d95-adf7-89353c0a9a65","shared_citers":10},{"title":"Evaluating Large Language Models Trained on Code","work_id":"042493e9-b26f-4b4e-bbde-382072ca9b08","shared_citers":9},{"title":"GLM-4.5V and GLM-4.1V-Thinking: Towards Versatile Multimodal Reasoning with Scalable Reinforcement Learning","work_id":"366607ba-e4ea-4726-98c3-63356e32351c","shared_citers":8},{"title":"MiMo-V2-Flash Technical Report","work_id":"1f3df90c-4bc3-49b1-ad9b-7f3b34e4ffba","shared_citers":8},{"title":"Qwen2.5-VL Technical Report","work_id":"69dffacb-bfe8-442d-be86-48624c60426f","shared_citers":8},{"title":"The Llama 3 Herd of Models","work_id":"1549a635-88af-4ac1-acfe-51ae7bb53345","shared_citers":8},{"title":"$\\tau$-bench: A Benchmark for Tool-Agent-User Interaction in Real-World Domains","work_id":"6a8d8dc4-0cc0-4052-8109-abbcdcd4a962","shared_citers":7},{"title":"GPT-4o System Card","work_id":"f37bf1c7-4964-4e56-9762-d20da8d9009f","shared_citers":7},{"title":"LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code","work_id":"ea9e51ce-1e75-4182-92d8-4d25f70d2ee4","shared_citers":7},{"title":"Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at Any Resolution","work_id":"8abcfe4f-e0fb-44b7-9123-448fac95f90a","shared_citers":7},{"title":"SWE-bench: Can Language Models Resolve Real-World GitHub Issues?","work_id":"d0effe15-a689-441a-8e3f-ea35f1c4e4b1","shared_citers":7}],"time_series":[{"n":97,"year":2026}],"dependency_candidates":[]},"error":null,"updated_at":"2026-05-14T05:46:36.920241+00:00"},"identity_refresh":{"job_type":"identity_refresh","status":"succeeded","result":{"items":[{"title":"Qwen3 Technical Report","outcome":"unchanged","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","resolver":"local_arxiv","confidence":0.98,"old_work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e"}],"counts":{"fixed":0,"merged":0,"unchanged":1,"quarantined":0,"needs_external_resolution":0},"errors":[],"attempted":1},"error":null,"updated_at":"2026-05-14T05:46:51.249600+00:00"},"role_polarity":{"job_type":"role_polarity","status":"succeeded","result":{"title":"Kimi K2.5: Visual Agentic Intelligence","claims":[{"claim_text":"We introduce Kimi K2.5, an open-source multimodal agentic model designed to advance general agentic intelligence. K2.5 emphasizes the joint optimization of text and vision so that two modalities enhance each other. This includes a series of techniques such as joint text-vision pre-training, zero-vision SFT, and joint text-vision reinforcement learning. Building on this multimodal foundation, K2.5 introduces Agent Swarm, a self-directed parallel agent orchestration framework that dynamically decomposes complex tasks into heterogeneous sub-problems and executes them concurrently. Extensive evalu","claim_type":"abstract","evidence_strength":"source_metadata"},{"claim_text":"spanning two paradigms: Proprietary, including Claude Haiku 4.5 [61], Claude Sonnet 4.6 [62], Gemini 3 Flash Preview [63], Gemini 3.1 Pro Preview [64], GPT-5.4 [65], GPT-5.4 Mini [66], and Grok 4.20 [67]; and Open-weights, including DeepSeek V3.2 [68], GLM 5.1 [69], Kimi K2.5 [70], Llama 3.3 70B Instruct [71], MiniMax M2.7 [72], and Qwen3 30B A3B [73], Qwen3.5 397B A17B [74]. Harnesses.We consider four mainstream harnesses, namely vanilla ReAct [4], Codex [75], Claude Code [76], and OpenClaw [7]","claim_type":"background","confidence":0.7,"evidence_strength":"citation_context"}],"why_cited":"Pith tracks Kimi K2.5: Visual Agentic Intelligence because it crossed a citation-hub threshold. Current citing contexts most often use it as background evidence (1 contexts).","role_counts":[{"n":1,"context_role":"background"}]},"error":null,"updated_at":"2026-05-14T22:26:28.977970+00:00"},"summary_claims":{"job_type":"summary_claims","status":"succeeded","result":{"title":"Kimi K2.5: Visual Agentic Intelligence","claims":[{"claim_text":"We introduce Kimi K2.5, an open-source multimodal agentic model designed to advance general agentic intelligence. K2.5 emphasizes the joint optimization of text and vision so that two modalities enhance each other. This includes a series of techniques such as joint text-vision pre-training, zero-vision SFT, and joint text-vision reinforcement learning. Building on this multimodal foundation, K2.5 introduces Agent Swarm, a self-directed parallel agent orchestration framework that dynamically decomposes complex tasks into heterogeneous sub-problems and executes them concurrently. Extensive evalu","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks Kimi K2.5: Visual Agentic Intelligence because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-14T05:46:54.144012+00:00"}},"summary":{"title":"Kimi K2.5: Visual Agentic Intelligence","claims":[{"claim_text":"We introduce Kimi K2.5, an open-source multimodal agentic model designed to advance general agentic intelligence. K2.5 emphasizes the joint optimization of text and vision so that two modalities enhance each other. This includes a series of techniques such as joint text-vision pre-training, zero-vision SFT, and joint text-vision reinforcement learning. Building on this multimodal foundation, K2.5 introduces Agent Swarm, a self-directed parallel agent orchestration framework that dynamically decomposes complex tasks into heterogeneous sub-problems and executes them concurrently. Extensive evalu","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks Kimi K2.5: Visual Agentic Intelligence because it crossed a citation-hub threshold.","role_counts":[]},"graph":{"co_cited":[{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","shared_citers":37},{"title":"DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models","work_id":"c5006563-f3ec-438a-9e35-b7b484f34828","shared_citers":28},{"title":"GLM-5: from Vibe Coding to Agentic Engineering","work_id":"ad29b1a2-bf77-46b3-9ead-fb62b1d2c6fe","shared_citers":26},{"title":"DeepSeek-V3.2: Pushing the Frontier of Open Large Language Models","work_id":"07c85cc5-4086-4abc-823b-6d0f4ff784d0","shared_citers":24},{"title":"OpenAI GPT-5 System Card","work_id":"ca87689a-0d29-4476-b504-b65dbbb08af4","shared_citers":22},{"title":"Qwen3-VL Technical Report","work_id":"1fe243aa-e3c0-4da6-b391-4cbcfc88d5c0","shared_citers":22},{"title":"DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning","work_id":"e6b75ad5-2877-4168-97c8-710407094d20","shared_citers":17},{"title":"DAPO: An Open-Source LLM Reinforcement Learning System at Scale","work_id":"64019d00-0b11-4bbd-b173-b46c8fad0157","shared_citers":14},{"title":"DeepSeek-V3 Technical Report","work_id":"57d2791d-2219-4c31-a077-afc04b12a75c","shared_citers":14},{"title":"Gemini 2.5: Pushing the Frontier with Advanced Reasoning, Multimodality, Long Context, and Next Generation Agentic Capabilities","work_id":"008df105-2fdd-45d8-857a-8e35868aecb6","shared_citers":14},{"title":"Proximal Policy Optimization Algorithms","work_id":"240c67fe-d14d-4520-91c1-38a4e272ca19","shared_citers":14},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":11},{"title":"gpt-oss-120b & gpt-oss-20b Model Card","work_id":"178c1f7e-4f19-4392-a45d-45a6dfa88ead","shared_citers":10},{"title":"Group Sequence Policy Optimization","work_id":"3a98b53b-9f52-4d95-adf7-89353c0a9a65","shared_citers":10},{"title":"Evaluating Large Language Models Trained on Code","work_id":"042493e9-b26f-4b4e-bbde-382072ca9b08","shared_citers":9},{"title":"GLM-4.5V and GLM-4.1V-Thinking: Towards Versatile Multimodal Reasoning with Scalable Reinforcement Learning","work_id":"366607ba-e4ea-4726-98c3-63356e32351c","shared_citers":8},{"title":"MiMo-V2-Flash Technical Report","work_id":"1f3df90c-4bc3-49b1-ad9b-7f3b34e4ffba","shared_citers":8},{"title":"Qwen2.5-VL Technical Report","work_id":"69dffacb-bfe8-442d-be86-48624c60426f","shared_citers":8},{"title":"The Llama 3 Herd of Models","work_id":"1549a635-88af-4ac1-acfe-51ae7bb53345","shared_citers":8},{"title":"$\\tau$-bench: A Benchmark for Tool-Agent-User Interaction in Real-World Domains","work_id":"6a8d8dc4-0cc0-4052-8109-abbcdcd4a962","shared_citers":7},{"title":"GPT-4o System Card","work_id":"f37bf1c7-4964-4e56-9762-d20da8d9009f","shared_citers":7},{"title":"LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code","work_id":"ea9e51ce-1e75-4182-92d8-4d25f70d2ee4","shared_citers":7},{"title":"Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at Any Resolution","work_id":"8abcfe4f-e0fb-44b7-9123-448fac95f90a","shared_citers":7},{"title":"SWE-bench: Can Language Models Resolve Real-World GitHub Issues?","work_id":"d0effe15-a689-441a-8e3f-ea35f1c4e4b1","shared_citers":7}],"time_series":[{"n":97,"year":2026}],"dependency_candidates":[]},"authors":[{"id":"ba4c2b89-8e94-4891-8770-f87924f94945","orcid":null,"display_name":"Kimi Team: Tongtong Bai","source":"manual","import_confidence":0.72},{"id":"b730153a-83ef-4c74-99eb-a1cc72067ab7","orcid":null,"display_name":"S.H. Cai","source":"manual","import_confidence":0.72},{"id":"76b2fe34-7517-4245-a62c-a41c2c6bf630","orcid":null,"display_name":"Y. Charles","source":"manual","import_confidence":0.72},{"id":"26ebb414-c56a-4e74-a3b6-6cbf1eaadebc","orcid":null,"display_name":"Yifan Bai","source":"manual","import_confidence":0.72},{"id":"55eaa707-33dc-444a-9cde-f2e59599fb53","orcid":null,"display_name":"Yiping Bao","source":"manual","import_confidence":0.72},{"id":"562a40ed-89d1-4b05-9ab9-94452a987eb4","orcid":null,"display_name":"Yuan Cao","source":"manual","import_confidence":0.72}]}}