{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2025:LML3LZTZJJXZUHKBLKYI6CQESO","short_pith_number":"pith:LML3LZTZ","canonical_record":{"source":{"id":"2511.05271","kind":"arxiv","version":4},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2025-11-07T14:31:20Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"325f93af9a19209286686e8dea34e0546cb13ab9906fe36e5b42a7e0481e41aa","abstract_canon_sha256":"9f900df79673713cdc9187b7f1bf588d01115ec60ea5fc895565966b8b08796d"},"schema_version":"1.0"},"canonical_sha256":"5b17b5e6794a6f9a1d415ab08f0a0493a5c232e001bdb388f73d6c202b02cbe2","source":{"kind":"arxiv","id":"2511.05271","version":4},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2511.05271","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"arxiv_version","alias_value":"2511.05271v4","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2511.05271","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"pith_short_12","alias_value":"LML3LZTZJJXZ","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"LML3LZTZJJXZUHKB","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"LML3LZTZ","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2025:LML3LZTZJJXZUHKBLKYI6CQESO","target":"record","payload":{"canonical_record":{"source":{"id":"2511.05271","kind":"arxiv","version":4},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2025-11-07T14:31:20Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"325f93af9a19209286686e8dea34e0546cb13ab9906fe36e5b42a7e0481e41aa","abstract_canon_sha256":"9f900df79673713cdc9187b7f1bf588d01115ec60ea5fc895565966b8b08796d"},"schema_version":"1.0"},"canonical_sha256":"5b17b5e6794a6f9a1d415ab08f0a0493a5c232e001bdb388f73d6c202b02cbe2","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:48.964532Z","signature_b64":"GCcA58ZwBH/iuur5Q9nYDCvCdzP5LuvWg1dMHntURKRsOUuSetsLgjrXpcdnPV1O7cpLUWb7GR2TpAfSM3EFBw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"5b17b5e6794a6f9a1d415ab08f0a0493a5c232e001bdb388f73d6c202b02cbe2","last_reissued_at":"2026-05-17T23:38:48.964001Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:48.964001Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2511.05271","source_version":4,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:48Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"jI7ENYiMLU2cIQrp5/jos2xpMGx7gDjEAwdl7Qb69M4jEFkAaRROP2FX9B9+YEstzxHzzBvR6ZaMBqz/+FnPDw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-04T06:11:23.306857Z"},"content_sha256":"faddc16a84650325e7eae7e7be72a542e14b86797d59ed56e27ae9c9eb81be60","schema_version":"1.0","event_id":"sha256:faddc16a84650325e7eae7e7be72a542e14b86797d59ed56e27ae9c9eb81be60"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2025:LML3LZTZJJXZUHKBLKYI6CQESO","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"DeepEyesV2: Toward Agentic Multimodal Model","license":"http://creativecommons.org/licenses/by/4.0/","headline":"A two-stage cold-start followed by reinforcement learning induces robust tool-use behavior in multimodal models where direct reinforcement learning fails.","cross_cats":["cs.AI"],"primary_cat":"cs.CV","authors_text":"ChengLin Zhu, Chenxiao Zhao, Guohai Xu, Jack Hong, Weiheng Lu, Xing Yu","submitted_at":"2025-11-07T14:31:20Z","abstract_excerpt":"Agentic multimodal models should not only comprehend text and images, but also actively invoke external tools, such as code execution environments and web search, and integrate these operations into reasoning. In this work, we introduce DeepEyesV2 and explore how to build an agentic multimodal model from the perspectives of data construction, training methods, and model evaluation. We observe that direct reinforcement learning alone fails to induce robust tool-use behavior. This phenomenon motivates a two-stage training pipeline: a cold-start stage to establish tool-use patterns, and reinforce"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Direct reinforcement learning alone fails to induce robust tool-use behavior, but a two-stage pipeline of cold-start followed by reinforcement learning produces task-adaptive tool invocation and strong performance on RealX-Bench and other benchmarks.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the moderately challenging curated dataset containing examples where tool use is beneficial will produce generalizable tool-use patterns that transfer beyond the specific tasks and benchmarks reported.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"DeepEyesV2 uses a two-stage cold-start plus reinforcement learning pipeline to produce an agentic multimodal model that adaptively invokes tools and outperforms direct RL on real-world reasoning benchmarks.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"A two-stage cold-start followed by reinforcement learning induces robust tool-use behavior in multimodal models where direct reinforcement learning fails.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"f2fb57bed7ce591ba75228be6362eff6bae41ee94bcf2f7a1cc492ad85f26f67"},"source":{"id":"2511.05271","kind":"arxiv","version":4},"verdict":{"id":"febda460-2eb5-4a8a-8167-6b6508ffb318","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-16T05:28:40.015091Z","strongest_claim":"Direct reinforcement learning alone fails to induce robust tool-use behavior, but a two-stage pipeline of cold-start followed by reinforcement learning produces task-adaptive tool invocation and strong performance on RealX-Bench and other benchmarks.","one_line_summary":"DeepEyesV2 uses a two-stage cold-start plus reinforcement learning pipeline to produce an agentic multimodal model that adaptively invokes tools and outperforms direct RL on real-world reasoning benchmarks.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the moderately challenging curated dataset containing examples where tool use is beneficial will produce generalizable tool-use patterns that transfer beyond the specific tasks and benchmarks reported.","pith_extraction_headline":"A two-stage cold-start followed by reinforcement learning induces robust tool-use behavior in multimodal models where direct reinforcement learning fails."},"references":{"count":78,"sample":[{"doi":"","year":2019,"title":"Tallyqa: Answering complex counting questions","work_id":"bfee5365-9d55-4923-a6e6-fe307549fe21","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2025,"title":"Anthropic. Claude 4. https://www.anthropic.com/news/claude-4, 2025","work_id":"56b739f4-07b1-4fd3-b13a-c6d23ecb99d5","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"Qwen Technical Report","work_id":"bb1fd52f-6b2f-437c-9516-37bdf6eb9be8","ref_index":3,"cited_arxiv_id":"2309.16609","is_internal_anchor":true},{"doi":"","year":2025,"title":"Qwen2.5-vl: A family of vision-language models from 7b to 72b.arXiv preprint arXiv:2502.04567, 2025","work_id":"a5d690e0-8cac-4ebd-a95f-6f78db82c2de","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2025,"title":"SFT or RL? An Early Investigation into Training R1-Like Reasoning Large Vision-Language Models","work_id":"a521360c-8673-4d0d-a3a3-6eb9f7a71b90","ref_index":5,"cited_arxiv_id":"2504.11468","is_internal_anchor":true}],"resolved_work":78,"snapshot_sha256":"59d1495cc573fad5a53c4c44518191dd179568560654ee2a1e8524771ef45df4","internal_anchors":27},"formal_canon":{"evidence_count":3,"snapshot_sha256":"c30019b8c6ac0a42071dbfc9b9342fb880ec60aa233fc4eeae74c27f5698377e"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"febda460-2eb5-4a8a-8167-6b6508ffb318"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:48Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"UMz7x86WN1J3tPwk09T/KCXPIReadXLcKfydUQgDlBNzyKJYp643RMJej92BXHaN0+o4W+Z1jiXNmAdw6cGGDw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-04T06:11:23.307824Z"},"content_sha256":"0de0696ddcdcd72377fd60af953722e5894f127f5d02d8afb84842319f8a0898","schema_version":"1.0","event_id":"sha256:0de0696ddcdcd72377fd60af953722e5894f127f5d02d8afb84842319f8a0898"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/LML3LZTZJJXZUHKBLKYI6CQESO/bundle.json","state_url":"https://pith.science/pith/LML3LZTZJJXZUHKBLKYI6CQESO/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/LML3LZTZJJXZUHKBLKYI6CQESO/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-04T06:11:23Z","links":{"resolver":"https://pith.science/pith/LML3LZTZJJXZUHKBLKYI6CQESO","bundle":"https://pith.science/pith/LML3LZTZJJXZUHKBLKYI6CQESO/bundle.json","state":"https://pith.science/pith/LML3LZTZJJXZUHKBLKYI6CQESO/state.json","well_known_bundle":"https://pith.science/.well-known/pith/LML3LZTZJJXZUHKBLKYI6CQESO/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2025:LML3LZTZJJXZUHKBLKYI6CQESO","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"9f900df79673713cdc9187b7f1bf588d01115ec60ea5fc895565966b8b08796d","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2025-11-07T14:31:20Z","title_canon_sha256":"325f93af9a19209286686e8dea34e0546cb13ab9906fe36e5b42a7e0481e41aa"},"schema_version":"1.0","source":{"id":"2511.05271","kind":"arxiv","version":4}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2511.05271","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"arxiv_version","alias_value":"2511.05271v4","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2511.05271","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"pith_short_12","alias_value":"LML3LZTZJJXZ","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"LML3LZTZJJXZUHKB","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"LML3LZTZ","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:0de0696ddcdcd72377fd60af953722e5894f127f5d02d8afb84842319f8a0898","target":"graph","created_at":"2026-05-17T23:38:48Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Direct reinforcement learning alone fails to induce robust tool-use behavior, but a two-stage pipeline of cold-start followed by reinforcement learning produces task-adaptive tool invocation and strong performance on RealX-Bench and other benchmarks."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the moderately challenging curated dataset containing examples where tool use is beneficial will produce generalizable tool-use patterns that transfer beyond the specific tasks and benchmarks reported."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"DeepEyesV2 uses a two-stage cold-start plus reinforcement learning pipeline to produce an agentic multimodal model that adaptively invokes tools and outperforms direct RL on real-world reasoning benchmarks."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"A two-stage cold-start followed by reinforcement learning induces robust tool-use behavior in multimodal models where direct reinforcement learning fails."}],"snapshot_sha256":"f2fb57bed7ce591ba75228be6362eff6bae41ee94bcf2f7a1cc492ad85f26f67"},"formal_canon":{"evidence_count":3,"snapshot_sha256":"c30019b8c6ac0a42071dbfc9b9342fb880ec60aa233fc4eeae74c27f5698377e"},"paper":{"abstract_excerpt":"Agentic multimodal models should not only comprehend text and images, but also actively invoke external tools, such as code execution environments and web search, and integrate these operations into reasoning. In this work, we introduce DeepEyesV2 and explore how to build an agentic multimodal model from the perspectives of data construction, training methods, and model evaluation. We observe that direct reinforcement learning alone fails to induce robust tool-use behavior. This phenomenon motivates a two-stage training pipeline: a cold-start stage to establish tool-use patterns, and reinforce","authors_text":"ChengLin Zhu, Chenxiao Zhao, Guohai Xu, Jack Hong, Weiheng Lu, Xing Yu","cross_cats":["cs.AI"],"headline":"A two-stage cold-start followed by reinforcement learning induces robust tool-use behavior in multimodal models where direct reinforcement learning fails.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2025-11-07T14:31:20Z","title":"DeepEyesV2: Toward Agentic Multimodal Model"},"references":{"count":78,"internal_anchors":27,"resolved_work":78,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Tallyqa: Answering complex counting questions","work_id":"bfee5365-9d55-4923-a6e6-fe307549fe21","year":2019},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Anthropic. Claude 4. https://www.anthropic.com/news/claude-4, 2025","work_id":"56b739f4-07b1-4fd3-b13a-c6d23ecb99d5","year":2025},{"cited_arxiv_id":"2309.16609","doi":"","is_internal_anchor":true,"ref_index":3,"title":"Qwen Technical Report","work_id":"bb1fd52f-6b2f-437c-9516-37bdf6eb9be8","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Qwen2.5-vl: A family of vision-language models from 7b to 72b.arXiv preprint arXiv:2502.04567, 2025","work_id":"a5d690e0-8cac-4ebd-a95f-6f78db82c2de","year":2025},{"cited_arxiv_id":"2504.11468","doi":"","is_internal_anchor":true,"ref_index":5,"title":"SFT or RL? An Early Investigation into Training R1-Like Reasoning Large Vision-Language Models","work_id":"a521360c-8673-4d0d-a3a3-6eb9f7a71b90","year":2025}],"snapshot_sha256":"59d1495cc573fad5a53c4c44518191dd179568560654ee2a1e8524771ef45df4"},"source":{"id":"2511.05271","kind":"arxiv","version":4},"verdict":{"created_at":"2026-05-16T05:28:40.015091Z","id":"febda460-2eb5-4a8a-8167-6b6508ffb318","model_set":{"reader":"grok-4.3"},"one_line_summary":"DeepEyesV2 uses a two-stage cold-start plus reinforcement learning pipeline to produce an agentic multimodal model that adaptively invokes tools and outperforms direct RL on real-world reasoning benchmarks.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"A two-stage cold-start followed by reinforcement learning induces robust tool-use behavior in multimodal models where direct reinforcement learning fails.","strongest_claim":"Direct reinforcement learning alone fails to induce robust tool-use behavior, but a two-stage pipeline of cold-start followed by reinforcement learning produces task-adaptive tool invocation and strong performance on RealX-Bench and other benchmarks.","weakest_assumption":"That the moderately challenging curated dataset containing examples where tool use is beneficial will produce generalizable tool-use patterns that transfer beyond the specific tasks and benchmarks reported."}},"verdict_id":"febda460-2eb5-4a8a-8167-6b6508ffb318"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:faddc16a84650325e7eae7e7be72a542e14b86797d59ed56e27ae9c9eb81be60","target":"record","created_at":"2026-05-17T23:38:48Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"9f900df79673713cdc9187b7f1bf588d01115ec60ea5fc895565966b8b08796d","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2025-11-07T14:31:20Z","title_canon_sha256":"325f93af9a19209286686e8dea34e0546cb13ab9906fe36e5b42a7e0481e41aa"},"schema_version":"1.0","source":{"id":"2511.05271","kind":"arxiv","version":4}},"canonical_sha256":"5b17b5e6794a6f9a1d415ab08f0a0493a5c232e001bdb388f73d6c202b02cbe2","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"5b17b5e6794a6f9a1d415ab08f0a0493a5c232e001bdb388f73d6c202b02cbe2","first_computed_at":"2026-05-17T23:38:48.964001Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:48.964001Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"GCcA58ZwBH/iuur5Q9nYDCvCdzP5LuvWg1dMHntURKRsOUuSetsLgjrXpcdnPV1O7cpLUWb7GR2TpAfSM3EFBw==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:48.964532Z","signed_message":"canonical_sha256_bytes"},"source_id":"2511.05271","source_kind":"arxiv","source_version":4}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:faddc16a84650325e7eae7e7be72a542e14b86797d59ed56e27ae9c9eb81be60","sha256:0de0696ddcdcd72377fd60af953722e5894f127f5d02d8afb84842319f8a0898"],"state_sha256":"5dac06972fa4dee9ca96cae68e963fa442ead14fa19de49ab74994d45ef70c9f"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"PUUIrrCjwDdQ/q12aznzrhJZ5/Oard1m5sE0Qc8j2XPpxmcOI4HSSY2jkkIAu1LHorH9ZE3senpIyrqT+9zeBQ==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-04T06:11:23.311568Z","bundle_sha256":"653f3c9ab851edaefe82b987b99b2718da3d5e9dc32626d52a936f66a02b961e"}}