{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2024:YUTF4YCITC3EGOBXRLV65Q3MUC","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"c5f8fd01b2b3b4d5ef5e685f92a621b6282414be9d2e0bdc8ea8d15f6eb155eb","cross_cats_sorted":["cs.AI","cs.CL","cs.CV"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.IR","submitted_at":"2024-01-03T08:33:09Z","title_canon_sha256":"226fa896a9db28a6cfee31311a098a43f3414f63a0fab3c5023cb7fce7453933"},"schema_version":"1.0","source":{"id":"2401.01614","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2401.01614","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"arxiv_version","alias_value":"2401.01614v2","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2401.01614","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"pith_short_12","alias_value":"YUTF4YCITC3E","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"YUTF4YCITC3EGOBX","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"YUTF4YCI","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:7dd4cbd1d3468f1d4eebb4189287fb7bfff668c90e1bf5f00038cd81c835383c","target":"graph","created_at":"2026-05-17T23:38:50Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"we show that GPT-4V presents a great potential for web agents -- it can successfully complete 51.1 of the tasks on live websites if we manually ground its textual plans into actions on the websites."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That manual grounding of the model's textual plans provides a valid upper-bound proxy for evaluating the agent's planning and reasoning capability, while automatic grounding methods remain underdeveloped."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"GPT-4V achieves 51.1% success on live web tasks as a generalist agent when plans are manually grounded, outperforming text-only models, but automatic grounding lags far behind oracle performance."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"GPT-4V completes 51.1 percent of tasks on live websites when its textual plans are manually grounded into actions."}],"snapshot_sha256":"f21015be4d03b1371becfa7055fb740a1eea13576647e352e660594c367d329c"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"26ff05c78d1f57e1a39b54c923415ee22bbbcac73d1fd0114e64422de5386f5d"},"paper":{"abstract_excerpt":"The recent development on large multimodal models (LMMs), especially GPT-4V(ision) and Gemini, has been quickly expanding the capability boundaries of multimodal models beyond traditional tasks like image captioning and visual question answering. In this work, we explore the potential of LMMs like GPT-4V as a generalist web agent that can follow natural language instructions to complete tasks on any given website. We propose SEEACT, a generalist web agent that harnesses the power of LMMs for integrated visual understanding and acting on the web. We evaluate on the recent MIND2WEB benchmark. In","authors_text":"Boyuan Zheng, Boyu Gou, Huan Sun, Jihyung Kil, Yu Su","cross_cats":["cs.AI","cs.CL","cs.CV"],"headline":"GPT-4V completes 51.1 percent of tasks on live websites when its textual plans are manually grounded into actions.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.IR","submitted_at":"2024-01-03T08:33:09Z","title":"GPT-4V(ision) is a Generalist Web Agent, if Grounded"},"references":{"count":42,"internal_anchors":13,"resolved_work":42,"sample":[{"cited_arxiv_id":"2204.14198","doi":"","is_internal_anchor":true,"ref_index":1,"title":"Flamingo: a Visual Language Model for Few-Shot Learning","work_id":"a110f764-38dc-41b2-a802-53744ecea1fc","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"org/CorpusID:248476411","work_id":"c474190e-eb6d-4bb6-b3c1-6316440acd57","year":null},{"cited_arxiv_id":"2306.15195","doi":"","is_internal_anchor":true,"ref_index":3,"title":"Shikra: Unleashing Multimodal LLM's Referential Dialogue Magic","work_id":"44525076-312a-4259-b79c-134cd7eeb297","year":2021},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"org/CorpusID:259262082","work_id":"3a62ab25-76b6-49dd-9ad6-bc5a137343a9","year":null},{"cited_arxiv_id":"2210.11416","doi":"","is_internal_anchor":true,"ref_index":5,"title":"Scaling Instruction-Finetuned Language Models","work_id":"8405abb1-7558-4fdf-af24-f4c52fa77a06","year":null}],"snapshot_sha256":"e2f7c88029e9af74c8c3cf54814800bba9b325275bae538d70b4b2e7f7a71ae8"},"source":{"id":"2401.01614","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-15T19:37:24.478166Z","id":"453bd831-211d-4e90-b218-340e0ca1b4d7","model_set":{"reader":"grok-4.3"},"one_line_summary":"GPT-4V achieves 51.1% success on live web tasks as a generalist agent when plans are manually grounded, outperforming text-only models, but automatic grounding lags far behind oracle performance.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"GPT-4V completes 51.1 percent of tasks on live websites when its textual plans are manually grounded into actions.","strongest_claim":"we show that GPT-4V presents a great potential for web agents -- it can successfully complete 51.1 of the tasks on live websites if we manually ground its textual plans into actions on the websites.","weakest_assumption":"That manual grounding of the model's textual plans provides a valid upper-bound proxy for evaluating the agent's planning and reasoning capability, while automatic grounding methods remain underdeveloped."}},"verdict_id":"453bd831-211d-4e90-b218-340e0ca1b4d7"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:eb317688bdc8e02fd5cd139e5af17d3525444e5989b99d5ae989db3ba9afd589","target":"record","created_at":"2026-05-17T23:38:50Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"c5f8fd01b2b3b4d5ef5e685f92a621b6282414be9d2e0bdc8ea8d15f6eb155eb","cross_cats_sorted":["cs.AI","cs.CL","cs.CV"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.IR","submitted_at":"2024-01-03T08:33:09Z","title_canon_sha256":"226fa896a9db28a6cfee31311a098a43f3414f63a0fab3c5023cb7fce7453933"},"schema_version":"1.0","source":{"id":"2401.01614","kind":"arxiv","version":2}},"canonical_sha256":"c5265e604898b64338378aebeec36ca0a9bd6641f715b6691a2cec878dad0d8f","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"c5265e604898b64338378aebeec36ca0a9bd6641f715b6691a2cec878dad0d8f","first_computed_at":"2026-05-17T23:38:50.385915Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:50.385915Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"XB6YPGyWMPXx7YKvHhX1DneBUxLpUDx3DCvz5ZlbxyWQvbJB35nZOrYmF+TRvjYhJ58CseOIncYCCaxY3gbQCg==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:50.386346Z","signed_message":"canonical_sha256_bytes"},"source_id":"2401.01614","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:eb317688bdc8e02fd5cd139e5af17d3525444e5989b99d5ae989db3ba9afd589","sha256:7dd4cbd1d3468f1d4eebb4189287fb7bfff668c90e1bf5f00038cd81c835383c"],"state_sha256":"a772ff5cecd3880fe9e9c1a5d24499e9776856e527e2c5413b485310d3bc0e9e"}