{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2025:STZE3XGYUA5FI64VOLYHDLMIWB","short_pith_number":"pith:STZE3XGY","canonical_record":{"source":{"id":"2508.05748","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.IR","submitted_at":"2025-08-07T18:03:50Z","cross_cats_sorted":[],"title_canon_sha256":"a543c002b68a22ea3cccb801774aeff5d9c3a7cd3a2ef1ba117c6e419776e988","abstract_canon_sha256":"e5f2ae3615b247e22deaa32da02a6ac383263c0d2ad78dace4e467850ce21504"},"schema_version":"1.0"},"canonical_sha256":"94f24ddcd8a03a547b9572f071ad88b064a7504c02a0adb1f23fbe038cec5ac2","source":{"kind":"arxiv","id":"2508.05748","version":3},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2508.05748","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"arxiv_version","alias_value":"2508.05748v3","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2508.05748","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"pith_short_12","alias_value":"STZE3XGYUA5F","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"STZE3XGYUA5FI64V","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"STZE3XGY","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2025:STZE3XGYUA5FI64VOLYHDLMIWB","target":"record","payload":{"canonical_record":{"source":{"id":"2508.05748","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.IR","submitted_at":"2025-08-07T18:03:50Z","cross_cats_sorted":[],"title_canon_sha256":"a543c002b68a22ea3cccb801774aeff5d9c3a7cd3a2ef1ba117c6e419776e988","abstract_canon_sha256":"e5f2ae3615b247e22deaa32da02a6ac383263c0d2ad78dace4e467850ce21504"},"schema_version":"1.0"},"canonical_sha256":"94f24ddcd8a03a547b9572f071ad88b064a7504c02a0adb1f23fbe038cec5ac2","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:50.510335Z","signature_b64":"rEASVho+LW+yev6fT7vfCjjrLX516s8BEJg+h+p9BwDlyFjqRe+VxmlmgiVLj5khgMOM/lifmWZBEbwjok+RAQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"94f24ddcd8a03a547b9572f071ad88b064a7504c02a0adb1f23fbe038cec5ac2","last_reissued_at":"2026-05-17T23:38:50.509905Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:50.509905Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2508.05748","source_version":3,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:50Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"bcmaz7ciEg/XOcclnUUY8bppZBtqJO6gMN6RWMFLDhtjMAEgxpE3FYwMijJ9z/sSPvuEQfphXgPz6ShYt9nIDw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-27T19:49:22.481442Z"},"content_sha256":"f69e50955e2f51e19410ac6f57ea1fbe58ee56a1e9fee69730da95b33d43b5c8","schema_version":"1.0","event_id":"sha256:f69e50955e2f51e19410ac6f57ea1fbe58ee56a1e9fee69730da95b33d43b5c8"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2025:STZE3XGYUA5FI64VOLYHDLMIWB","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"WebWatcher: Breaking New Frontier of Vision-Language Deep Research Agent","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"WebWatcher trains a vision-language agent on synthetic multimodal trajectories and reinforcement learning to outperform baselines on complex VQA tasks.","cross_cats":[],"primary_cat":"cs.IR","authors_text":"Chenxi Wang, Fei Huang, Jialong Wu, Jingren Zhou, Kuan Li, Pengjun Xie, Peng Xia, Qiuchen Wang, Ruixue Ding, Xinyu Geng, Xinyu Wang, Yida Zhao, Yong Jiang, Zhen Zhang","submitted_at":"2025-08-07T18:03:50Z","abstract_excerpt":"Web agents such as Deep Research have demonstrated superhuman cognitive abilities, capable of solving highly challenging information-seeking problems. However, most research remains primarily text-centric, overlooking visual information in the real world. This makes multimodal Deep Research highly challenging, as such agents require much stronger reasoning abilities in perception, logic, knowledge, and the use of more sophisticated tools compared to text-based agents. To address this limitation, we introduce WebWatcher, a multi-modal Agent for Deep Research equipped with enhanced visual-langua"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Experimental results show that WebWatcher significantly outperforms proprietary baseline, RAG workflow and open-source agents in four challenging VQA benchmarks, which paves the way for solving complex multimodal information-seeking tasks.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That high-quality synthetic multimodal trajectories enable efficient cold start training for agents requiring stronger reasoning in perception, logic, knowledge, and that reinforcement learning further enhances generalization to complex tasks.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"WebWatcher introduces a vision-language deep research agent trained on synthetic multimodal trajectories and RL that outperforms baselines on VQA benchmarks, along with a new BrowseComp-VL evaluation.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"WebWatcher trains a vision-language agent on synthetic multimodal trajectories and reinforcement learning to outperform baselines on complex VQA tasks.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"22ee8a2002767720a94e9009d3be2c17acd2b73820827819092101246b76f308"},"source":{"id":"2508.05748","kind":"arxiv","version":3},"verdict":{"id":"26567cb1-5767-41dd-a7e2-bd7bde24e2af","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T18:53:03.016124Z","strongest_claim":"Experimental results show that WebWatcher significantly outperforms proprietary baseline, RAG workflow and open-source agents in four challenging VQA benchmarks, which paves the way for solving complex multimodal information-seeking tasks.","one_line_summary":"WebWatcher introduces a vision-language deep research agent trained on synthetic multimodal trajectories and RL that outperforms baselines on VQA benchmarks, along with a new BrowseComp-VL evaluation.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That high-quality synthetic multimodal trajectories enable efficient cold start training for agents requiring stronger reasoning in perception, logic, knowledge, and that reinforcement learning further enhances generalization to complex tasks.","pith_extraction_headline":"WebWatcher trains a vision-language agent on synthetic multimodal trajectories and reinforcement learning to outperform baselines on complex VQA tasks."},"references":{"count":31,"sample":[{"doi":"","year":null,"title":"Qwen2.5-VL Technical Report","work_id":"69dffacb-bfe8-442d-be86-48624c60426f","ref_index":1,"cited_arxiv_id":"2502.13923","is_internal_anchor":true},{"doi":"","year":null,"title":"Why reasoning matters? a survey of advancements in multimodal reasoning (v1)","work_id":"776ae2d4-b7ef-445f-be80-6568ff81d28e","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Evaluating Large Language Models Trained on Code","work_id":"042493e9-b26f-4b4e-bbde-382072ca9b08","ref_index":3,"cited_arxiv_id":"2107.03374","is_internal_anchor":true},{"doi":"","year":null,"title":"M3 cot: A novel benchmark for multi- domain multi-step multi-modal chain-of-thought","work_id":"4d99c33a-262f-4545-baae-925205f5b2bc","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"arXiv preprint arXiv:2302.11713 , year=","work_id":"b7899c66-9ee1-45bf-8a1d-223b8959dee6","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":31,"snapshot_sha256":"76492630528aa15e1a63ce6c7c470f818a70fd03e06a51739bacc263ea189853","internal_anchors":11},"formal_canon":{"evidence_count":2,"snapshot_sha256":"5a401a4a930993de4bc33549fba4b44051404b0a8b11f11f265c0734f743c0e0"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"26567cb1-5767-41dd-a7e2-bd7bde24e2af"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:50Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"jzvtUBGqeERVzhmCQ6NAZHvvJmit/qjX6S/54/0zZLINp8cQaI+aLECZ9FvqFL8uULrro9bxbZtfpi9OrxsWDA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-27T19:49:22.482456Z"},"content_sha256":"9864cf3d3747c7ab3e0372e65273159c4d3a30b857402b07df418bcd5ddbf34c","schema_version":"1.0","event_id":"sha256:9864cf3d3747c7ab3e0372e65273159c4d3a30b857402b07df418bcd5ddbf34c"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/STZE3XGYUA5FI64VOLYHDLMIWB/bundle.json","state_url":"https://pith.science/pith/STZE3XGYUA5FI64VOLYHDLMIWB/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/STZE3XGYUA5FI64VOLYHDLMIWB/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-27T19:49:22Z","links":{"resolver":"https://pith.science/pith/STZE3XGYUA5FI64VOLYHDLMIWB","bundle":"https://pith.science/pith/STZE3XGYUA5FI64VOLYHDLMIWB/bundle.json","state":"https://pith.science/pith/STZE3XGYUA5FI64VOLYHDLMIWB/state.json","well_known_bundle":"https://pith.science/.well-known/pith/STZE3XGYUA5FI64VOLYHDLMIWB/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2025:STZE3XGYUA5FI64VOLYHDLMIWB","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"e5f2ae3615b247e22deaa32da02a6ac383263c0d2ad78dace4e467850ce21504","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.IR","submitted_at":"2025-08-07T18:03:50Z","title_canon_sha256":"a543c002b68a22ea3cccb801774aeff5d9c3a7cd3a2ef1ba117c6e419776e988"},"schema_version":"1.0","source":{"id":"2508.05748","kind":"arxiv","version":3}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2508.05748","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"arxiv_version","alias_value":"2508.05748v3","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2508.05748","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"pith_short_12","alias_value":"STZE3XGYUA5F","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"STZE3XGYUA5FI64V","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"STZE3XGY","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:9864cf3d3747c7ab3e0372e65273159c4d3a30b857402b07df418bcd5ddbf34c","target":"graph","created_at":"2026-05-17T23:38:50Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Experimental results show that WebWatcher significantly outperforms proprietary baseline, RAG workflow and open-source agents in four challenging VQA benchmarks, which paves the way for solving complex multimodal information-seeking tasks."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That high-quality synthetic multimodal trajectories enable efficient cold start training for agents requiring stronger reasoning in perception, logic, knowledge, and that reinforcement learning further enhances generalization to complex tasks."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"WebWatcher introduces a vision-language deep research agent trained on synthetic multimodal trajectories and RL that outperforms baselines on VQA benchmarks, along with a new BrowseComp-VL evaluation."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"WebWatcher trains a vision-language agent on synthetic multimodal trajectories and reinforcement learning to outperform baselines on complex VQA tasks."}],"snapshot_sha256":"22ee8a2002767720a94e9009d3be2c17acd2b73820827819092101246b76f308"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"5a401a4a930993de4bc33549fba4b44051404b0a8b11f11f265c0734f743c0e0"},"paper":{"abstract_excerpt":"Web agents such as Deep Research have demonstrated superhuman cognitive abilities, capable of solving highly challenging information-seeking problems. However, most research remains primarily text-centric, overlooking visual information in the real world. This makes multimodal Deep Research highly challenging, as such agents require much stronger reasoning abilities in perception, logic, knowledge, and the use of more sophisticated tools compared to text-based agents. To address this limitation, we introduce WebWatcher, a multi-modal Agent for Deep Research equipped with enhanced visual-langua","authors_text":"Chenxi Wang, Fei Huang, Jialong Wu, Jingren Zhou, Kuan Li, Pengjun Xie, Peng Xia, Qiuchen Wang, Ruixue Ding, Xinyu Geng, Xinyu Wang, Yida Zhao, Yong Jiang, Zhen Zhang","cross_cats":[],"headline":"WebWatcher trains a vision-language agent on synthetic multimodal trajectories and reinforcement learning to outperform baselines on complex VQA tasks.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.IR","submitted_at":"2025-08-07T18:03:50Z","title":"WebWatcher: Breaking New Frontier of Vision-Language Deep Research Agent"},"references":{"count":31,"internal_anchors":11,"resolved_work":31,"sample":[{"cited_arxiv_id":"2502.13923","doi":"","is_internal_anchor":true,"ref_index":1,"title":"Qwen2.5-VL Technical Report","work_id":"69dffacb-bfe8-442d-be86-48624c60426f","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Why reasoning matters? a survey of advancements in multimodal reasoning (v1)","work_id":"776ae2d4-b7ef-445f-be80-6568ff81d28e","year":null},{"cited_arxiv_id":"2107.03374","doi":"","is_internal_anchor":true,"ref_index":3,"title":"Evaluating Large Language Models Trained on Code","work_id":"042493e9-b26f-4b4e-bbde-382072ca9b08","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"M3 cot: A novel benchmark for multi- domain multi-step multi-modal chain-of-thought","work_id":"4d99c33a-262f-4545-baae-925205f5b2bc","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"arXiv preprint arXiv:2302.11713 , year=","work_id":"b7899c66-9ee1-45bf-8a1d-223b8959dee6","year":null}],"snapshot_sha256":"76492630528aa15e1a63ce6c7c470f818a70fd03e06a51739bacc263ea189853"},"source":{"id":"2508.05748","kind":"arxiv","version":3},"verdict":{"created_at":"2026-05-15T18:53:03.016124Z","id":"26567cb1-5767-41dd-a7e2-bd7bde24e2af","model_set":{"reader":"grok-4.3"},"one_line_summary":"WebWatcher introduces a vision-language deep research agent trained on synthetic multimodal trajectories and RL that outperforms baselines on VQA benchmarks, along with a new BrowseComp-VL evaluation.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"WebWatcher trains a vision-language agent on synthetic multimodal trajectories and reinforcement learning to outperform baselines on complex VQA tasks.","strongest_claim":"Experimental results show that WebWatcher significantly outperforms proprietary baseline, RAG workflow and open-source agents in four challenging VQA benchmarks, which paves the way for solving complex multimodal information-seeking tasks.","weakest_assumption":"That high-quality synthetic multimodal trajectories enable efficient cold start training for agents requiring stronger reasoning in perception, logic, knowledge, and that reinforcement learning further enhances generalization to complex tasks."}},"verdict_id":"26567cb1-5767-41dd-a7e2-bd7bde24e2af"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:f69e50955e2f51e19410ac6f57ea1fbe58ee56a1e9fee69730da95b33d43b5c8","target":"record","created_at":"2026-05-17T23:38:50Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"e5f2ae3615b247e22deaa32da02a6ac383263c0d2ad78dace4e467850ce21504","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.IR","submitted_at":"2025-08-07T18:03:50Z","title_canon_sha256":"a543c002b68a22ea3cccb801774aeff5d9c3a7cd3a2ef1ba117c6e419776e988"},"schema_version":"1.0","source":{"id":"2508.05748","kind":"arxiv","version":3}},"canonical_sha256":"94f24ddcd8a03a547b9572f071ad88b064a7504c02a0adb1f23fbe038cec5ac2","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"94f24ddcd8a03a547b9572f071ad88b064a7504c02a0adb1f23fbe038cec5ac2","first_computed_at":"2026-05-17T23:38:50.509905Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:50.509905Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"rEASVho+LW+yev6fT7vfCjjrLX516s8BEJg+h+p9BwDlyFjqRe+VxmlmgiVLj5khgMOM/lifmWZBEbwjok+RAQ==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:50.510335Z","signed_message":"canonical_sha256_bytes"},"source_id":"2508.05748","source_kind":"arxiv","source_version":3}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:f69e50955e2f51e19410ac6f57ea1fbe58ee56a1e9fee69730da95b33d43b5c8","sha256:9864cf3d3747c7ab3e0372e65273159c4d3a30b857402b07df418bcd5ddbf34c"],"state_sha256":"15dae8e8bc304be3e596dcf30689b25ea72f2afcebfe987ea864d951e2dbc78a"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"LTRMT24v0RCRyjVmSCocG4TXhaZ6QDesDbtg/YAovBgdOWnkHVvPaqtq2frnT3hrMBJ9R8dbNwm5S+t3u599Cg==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-27T19:49:22.487102Z","bundle_sha256":"c4a964145defeffae395cae241207f238a053ba82b7c3402d1addf76cd7ad151"}}