{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2024:Q2GQS7TVDKWQC6IZHFI6RQ3SDW","short_pith_number":"pith:Q2GQS7TV","canonical_record":{"source":{"id":"2401.13919","kind":"arxiv","version":4},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-01-25T03:33:18Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"420fdf3b4a312821fda1b59f2bce724a229f966cb33653099f230c8a6b9c32b9","abstract_canon_sha256":"6702bf694a1a72778b94438e24cf82d3027090c9f99ae685bb236c7044f0866d"},"schema_version":"1.0"},"canonical_sha256":"868d097e751aad0179193951e8c3721db835422f8f153a33260cf0509d42be8f","source":{"kind":"arxiv","id":"2401.13919","version":4},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2401.13919","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"arxiv_version","alias_value":"2401.13919v4","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2401.13919","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"pith_short_12","alias_value":"Q2GQS7TVDKWQ","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"Q2GQS7TVDKWQC6IZ","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"Q2GQS7TV","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2024:Q2GQS7TVDKWQC6IZHFI6RQ3SDW","target":"record","payload":{"canonical_record":{"source":{"id":"2401.13919","kind":"arxiv","version":4},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-01-25T03:33:18Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"420fdf3b4a312821fda1b59f2bce724a229f966cb33653099f230c8a6b9c32b9","abstract_canon_sha256":"6702bf694a1a72778b94438e24cf82d3027090c9f99ae685bb236c7044f0866d"},"schema_version":"1.0"},"canonical_sha256":"868d097e751aad0179193951e8c3721db835422f8f153a33260cf0509d42be8f","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:49.893688Z","signature_b64":"PFM8aLW68N0Dnsj36bsYMT3YZDwMk0IstIrE9BIy6Y5QxOVHIY7o/ditKPv2rtivfIINilpzThAEW7PEaQklCw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"868d097e751aad0179193951e8c3721db835422f8f153a33260cf0509d42be8f","last_reissued_at":"2026-05-17T23:38:49.893202Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:49.893202Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2401.13919","source_version":4,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:49Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"pRvjy6pOoBFqBPc0uim3abbOCsUld+lzfDKGgOreuyZql8+UzGaVa1KweGyJvrfqxa+gUSb9HlLXVMDy8D+dAQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-08T12:43:02.925851Z"},"content_sha256":"39c6d14c8bab43ed26d9e219805a78c40c12705529a2a629a63033af5f6237f1","schema_version":"1.0","event_id":"sha256:39c6d14c8bab43ed26d9e219805a78c40c12705529a2a629a63033af5f6237f1"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2024:Q2GQS7TVDKWQC6IZHFI6RQ3SDW","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"WebVoyager: Building an End-to-End Web Agent with Large Multimodal Models","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"WebVoyager shows that large multimodal models can drive an end-to-end agent that completes open-ended tasks on live websites.","cross_cats":["cs.AI"],"primary_cat":"cs.CL","authors_text":"Dong Yu, Hongliang He, Hongming Zhang, Kaixin Ma, Wenhao Yu, Wenlin Yao, Yong Dai, Zhenzhong Lan","submitted_at":"2024-01-25T03:33:18Z","abstract_excerpt":"The rapid advancement of large language models (LLMs) has led to a new era marked by the development of autonomous applications in real-world scenarios, which drives innovation in creating advanced web agents. Existing web agents typically only handle one input modality and are evaluated only in simplified web simulators or static web snapshots, greatly limiting their applicability in real-world scenarios. To bridge this gap, we introduce WebVoyager, an innovative Large Multimodal Model (LMM) powered web agent that can complete user instructions end-to-end by interacting with real-world websit"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"WebVoyager achieves a 59.1% task success rate on our benchmark, significantly surpassing the performance of both GPT-4 (All Tools) and the WebVoyager (text-only) setups.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The automatic evaluation protocol using GPT-4V multimodal understanding accurately reflects human judgment of task completion on real websites.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"WebVoyager uses a large multimodal model to complete real-world web tasks end-to-end and reaches 59.1 percent success on a new benchmark of 15 live sites, with an automatic GPT-4V evaluator that matches human judgments 85 percent of the time.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"WebVoyager shows that large multimodal models can drive an end-to-end agent that completes open-ended tasks on live websites.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"ed412a925a4ab5d63e26afbdaaeaad352404cda0ab9ee0a4f88ca94a5fac7be7"},"source":{"id":"2401.13919","kind":"arxiv","version":4},"verdict":{"id":"df4da1e6-a2a6-4d8f-a88d-4d16a1f6ada9","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T22:40:28.332635Z","strongest_claim":"WebVoyager achieves a 59.1% task success rate on our benchmark, significantly surpassing the performance of both GPT-4 (All Tools) and the WebVoyager (text-only) setups.","one_line_summary":"WebVoyager uses a large multimodal model to complete real-world web tasks end-to-end and reaches 59.1 percent success on a new benchmark of 15 live sites, with an automatic GPT-4V evaluator that matches human judgments 85 percent of the time.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The automatic evaluation protocol using GPT-4V multimodal understanding accurately reflects human judgment of task completion on real websites.","pith_extraction_headline":"WebVoyager shows that large multimodal models can drive an end-to-end agent that completes open-ended tasks on live websites."},"references":{"count":13,"sample":[{"doi":"","year":2022,"title":"Mind2Web: Towards a Generalist Agent for the Web","work_id":"e26f5a00-c007-439d-83f6-7900f5687b6b","ref_index":1,"cited_arxiv_id":"2306.06070","is_internal_anchor":true},{"doi":"","year":2021,"title":"GAIA: a benchmark for General AI Assistants","work_id":"cf222b33-f7a3-4044-a570-ecfe25edb3f8","ref_index":2,"cited_arxiv_id":"2311.12983","is_internal_anchor":true},{"doi":"","year":2023,"title":"ToolLLM: Facilitating Large Language Models to Master 16000+ Real-world APIs","work_id":"3c555b48-a4d9-42dd-9fdd-0f6018fbe9cb","ref_index":3,"cited_arxiv_id":"2307.16789","is_internal_anchor":true},{"doi":"","year":null,"title":"E Additional Related Work Vision-based Agents Concurrent to our work, a few related works also studied vision-based au- tonomous agents","work_id":"46e0c71d-42d1-4988-8e0a-eb953f20364a","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"SeeClick (Cheng et al., 2024) focused on finetuning an LMM to solely leverage screenshots as inputs to interact Imagine you are a robot browsing the web, just like humans","work_id":"9eac54cc-e420-4e66-97ad-e6c50c1fbb34","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":13,"snapshot_sha256":"93c8a8653f79df6d2cea72bfd59789e0d33f2ccd8256022d05ee2cb5fa2e8bc2","internal_anchors":3},"formal_canon":{"evidence_count":3,"snapshot_sha256":"3747f995a3e55740245cd65aeb7cefb92ce405e91eab702fd0dd7de3b3fae0c4"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"df4da1e6-a2a6-4d8f-a88d-4d16a1f6ada9"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:49Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"EvlxYY0epiQK60VyYOnEfi8jsueHILqZUMC1gJet7Ol8pVRriAgvdx5MFOUQNvF3wVdPO+DUwL20/VIqu5Q1CA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-08T12:43:02.926782Z"},"content_sha256":"1a1807cf205b4e72795b47cd9e788525b1fa1642aa9b70692adc2ae8917b7fd0","schema_version":"1.0","event_id":"sha256:1a1807cf205b4e72795b47cd9e788525b1fa1642aa9b70692adc2ae8917b7fd0"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/Q2GQS7TVDKWQC6IZHFI6RQ3SDW/bundle.json","state_url":"https://pith.science/pith/Q2GQS7TVDKWQC6IZHFI6RQ3SDW/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/Q2GQS7TVDKWQC6IZHFI6RQ3SDW/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-08T12:43:02Z","links":{"resolver":"https://pith.science/pith/Q2GQS7TVDKWQC6IZHFI6RQ3SDW","bundle":"https://pith.science/pith/Q2GQS7TVDKWQC6IZHFI6RQ3SDW/bundle.json","state":"https://pith.science/pith/Q2GQS7TVDKWQC6IZHFI6RQ3SDW/state.json","well_known_bundle":"https://pith.science/.well-known/pith/Q2GQS7TVDKWQC6IZHFI6RQ3SDW/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2024:Q2GQS7TVDKWQC6IZHFI6RQ3SDW","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"6702bf694a1a72778b94438e24cf82d3027090c9f99ae685bb236c7044f0866d","cross_cats_sorted":["cs.AI"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-01-25T03:33:18Z","title_canon_sha256":"420fdf3b4a312821fda1b59f2bce724a229f966cb33653099f230c8a6b9c32b9"},"schema_version":"1.0","source":{"id":"2401.13919","kind":"arxiv","version":4}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2401.13919","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"arxiv_version","alias_value":"2401.13919v4","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2401.13919","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"pith_short_12","alias_value":"Q2GQS7TVDKWQ","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"Q2GQS7TVDKWQC6IZ","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"Q2GQS7TV","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:1a1807cf205b4e72795b47cd9e788525b1fa1642aa9b70692adc2ae8917b7fd0","target":"graph","created_at":"2026-05-17T23:38:49Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"WebVoyager achieves a 59.1% task success rate on our benchmark, significantly surpassing the performance of both GPT-4 (All Tools) and the WebVoyager (text-only) setups."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"The automatic evaluation protocol using GPT-4V multimodal understanding accurately reflects human judgment of task completion on real websites."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"WebVoyager uses a large multimodal model to complete real-world web tasks end-to-end and reaches 59.1 percent success on a new benchmark of 15 live sites, with an automatic GPT-4V evaluator that matches human judgments 85 percent of the time."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"WebVoyager shows that large multimodal models can drive an end-to-end agent that completes open-ended tasks on live websites."}],"snapshot_sha256":"ed412a925a4ab5d63e26afbdaaeaad352404cda0ab9ee0a4f88ca94a5fac7be7"},"formal_canon":{"evidence_count":3,"snapshot_sha256":"3747f995a3e55740245cd65aeb7cefb92ce405e91eab702fd0dd7de3b3fae0c4"},"paper":{"abstract_excerpt":"The rapid advancement of large language models (LLMs) has led to a new era marked by the development of autonomous applications in real-world scenarios, which drives innovation in creating advanced web agents. Existing web agents typically only handle one input modality and are evaluated only in simplified web simulators or static web snapshots, greatly limiting their applicability in real-world scenarios. To bridge this gap, we introduce WebVoyager, an innovative Large Multimodal Model (LMM) powered web agent that can complete user instructions end-to-end by interacting with real-world websit","authors_text":"Dong Yu, Hongliang He, Hongming Zhang, Kaixin Ma, Wenhao Yu, Wenlin Yao, Yong Dai, Zhenzhong Lan","cross_cats":["cs.AI"],"headline":"WebVoyager shows that large multimodal models can drive an end-to-end agent that completes open-ended tasks on live websites.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-01-25T03:33:18Z","title":"WebVoyager: Building an End-to-End Web Agent with Large Multimodal Models"},"references":{"count":13,"internal_anchors":3,"resolved_work":13,"sample":[{"cited_arxiv_id":"2306.06070","doi":"","is_internal_anchor":true,"ref_index":1,"title":"Mind2Web: Towards a Generalist Agent for the Web","work_id":"e26f5a00-c007-439d-83f6-7900f5687b6b","year":2022},{"cited_arxiv_id":"2311.12983","doi":"","is_internal_anchor":true,"ref_index":2,"title":"GAIA: a benchmark for General AI Assistants","work_id":"cf222b33-f7a3-4044-a570-ecfe25edb3f8","year":2021},{"cited_arxiv_id":"2307.16789","doi":"","is_internal_anchor":true,"ref_index":3,"title":"ToolLLM: Facilitating Large Language Models to Master 16000+ Real-world APIs","work_id":"3c555b48-a4d9-42dd-9fdd-0f6018fbe9cb","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"E Additional Related Work Vision-based Agents Concurrent to our work, a few related works also studied vision-based au- tonomous agents","work_id":"46e0c71d-42d1-4988-8e0a-eb953f20364a","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"SeeClick (Cheng et al., 2024) focused on finetuning an LMM to solely leverage screenshots as inputs to interact Imagine you are a robot browsing the web, just like humans","work_id":"9eac54cc-e420-4e66-97ad-e6c50c1fbb34","year":2024}],"snapshot_sha256":"93c8a8653f79df6d2cea72bfd59789e0d33f2ccd8256022d05ee2cb5fa2e8bc2"},"source":{"id":"2401.13919","kind":"arxiv","version":4},"verdict":{"created_at":"2026-05-15T22:40:28.332635Z","id":"df4da1e6-a2a6-4d8f-a88d-4d16a1f6ada9","model_set":{"reader":"grok-4.3"},"one_line_summary":"WebVoyager uses a large multimodal model to complete real-world web tasks end-to-end and reaches 59.1 percent success on a new benchmark of 15 live sites, with an automatic GPT-4V evaluator that matches human judgments 85 percent of the time.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"WebVoyager shows that large multimodal models can drive an end-to-end agent that completes open-ended tasks on live websites.","strongest_claim":"WebVoyager achieves a 59.1% task success rate on our benchmark, significantly surpassing the performance of both GPT-4 (All Tools) and the WebVoyager (text-only) setups.","weakest_assumption":"The automatic evaluation protocol using GPT-4V multimodal understanding accurately reflects human judgment of task completion on real websites."}},"verdict_id":"df4da1e6-a2a6-4d8f-a88d-4d16a1f6ada9"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:39c6d14c8bab43ed26d9e219805a78c40c12705529a2a629a63033af5f6237f1","target":"record","created_at":"2026-05-17T23:38:49Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"6702bf694a1a72778b94438e24cf82d3027090c9f99ae685bb236c7044f0866d","cross_cats_sorted":["cs.AI"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-01-25T03:33:18Z","title_canon_sha256":"420fdf3b4a312821fda1b59f2bce724a229f966cb33653099f230c8a6b9c32b9"},"schema_version":"1.0","source":{"id":"2401.13919","kind":"arxiv","version":4}},"canonical_sha256":"868d097e751aad0179193951e8c3721db835422f8f153a33260cf0509d42be8f","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"868d097e751aad0179193951e8c3721db835422f8f153a33260cf0509d42be8f","first_computed_at":"2026-05-17T23:38:49.893202Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:49.893202Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"PFM8aLW68N0Dnsj36bsYMT3YZDwMk0IstIrE9BIy6Y5QxOVHIY7o/ditKPv2rtivfIINilpzThAEW7PEaQklCw==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:49.893688Z","signed_message":"canonical_sha256_bytes"},"source_id":"2401.13919","source_kind":"arxiv","source_version":4}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:39c6d14c8bab43ed26d9e219805a78c40c12705529a2a629a63033af5f6237f1","sha256:1a1807cf205b4e72795b47cd9e788525b1fa1642aa9b70692adc2ae8917b7fd0"],"state_sha256":"c4f454f28ccf37f0b51590dd99bca7f673602f37a063b6052911bac0173be429"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"y7eeYc1JFsVL/z8I33z3q0JV2UfY5t9cLiu/aZYg/OmuEh/12ZIC261u0TXwDRVB2OWvwem3a7MBY/gzD0W8BA==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-08T12:43:02.929693Z","bundle_sha256":"1d4f7fd5e81cac471f4846032c2bbfdf26a57fe86f8cebe2c3e2871b65e57444"}}