{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2024:B6N5RPC67O33FJY2I5FPMYLZZO","short_pith_number":"pith:B6N5RPC6","canonical_record":{"source":{"id":"2403.07718","kind":"arxiv","version":5},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2024-03-12T14:58:45Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"6ac53eeabc9ba4a7957514da4595c3bd216575a61e7de3fd99f2fd3b9d5a0af2","abstract_canon_sha256":"241cc0cc95b853603ea2fb29976c470fc5f752468f33e0ea0bfdf7a31e2cb398"},"schema_version":"1.0"},"canonical_sha256":"0f9bd8bc5efbb7b2a71a474af66179cb8b53111d2184deeaedb5e532799e08ad","source":{"kind":"arxiv","id":"2403.07718","version":5},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2403.07718","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"arxiv_version","alias_value":"2403.07718v5","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2403.07718","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"pith_short_12","alias_value":"B6N5RPC67O33","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"B6N5RPC67O33FJY2","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"B6N5RPC6","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2024:B6N5RPC67O33FJY2I5FPMYLZZO","target":"record","payload":{"canonical_record":{"source":{"id":"2403.07718","kind":"arxiv","version":5},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2024-03-12T14:58:45Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"6ac53eeabc9ba4a7957514da4595c3bd216575a61e7de3fd99f2fd3b9d5a0af2","abstract_canon_sha256":"241cc0cc95b853603ea2fb29976c470fc5f752468f33e0ea0bfdf7a31e2cb398"},"schema_version":"1.0"},"canonical_sha256":"0f9bd8bc5efbb7b2a71a474af66179cb8b53111d2184deeaedb5e532799e08ad","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:53.769963Z","signature_b64":"+ixWyl9YBe7Eg3koRFjX8ZiY7UGxG+rfZV8eogzqAjAEt5PORpnnA+to4h2IUGCnqggNIn9Mr4gnPh3y09zQCA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"0f9bd8bc5efbb7b2a71a474af66179cb8b53111d2184deeaedb5e532799e08ad","last_reissued_at":"2026-05-17T23:38:53.769379Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:53.769379Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2403.07718","source_version":5,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:53Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"emFsWPs1PAy6VpKdKwGblpoaSEbb+fUASYEZG75Mp27BzFTHpBdnNFWPqXUzMCxUxvZp9fDXP4yahNCTe9fWAA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-05T09:27:15.460090Z"},"content_sha256":"46640c8eb329f311afa8f9b695510b4f353304552f56642dc3a5a1da6cba6ac8","schema_version":"1.0","event_id":"sha256:46640c8eb329f311afa8f9b695510b4f353304552f56642dc3a5a1da6cba6ac8"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2024:B6N5RPC67O33FJY2I5FPMYLZZO","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"WorkArena: How Capable Are Web Agents at Solving Common Knowledge Work Tasks?","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"Web agents based on large language models show some success on enterprise tasks but leave a large gap to full automation","cross_cats":["cs.AI"],"primary_cat":"cs.LG","authors_text":"Alexandre Drouin, Alexandre Lacoste, David Vazquez, Issam H. Laradji, L\\'eo Boisvert, Manuel Del Verme, Massimo Caccia, Maxime Gasse, Megh Thakkar, Nicolas Chapados, Quentin Cappart, Tom Marty","submitted_at":"2024-03-12T14:58:45Z","abstract_excerpt":"We study the use of large language model-based agents for interacting with software via web browsers. Unlike prior work, we focus on measuring the agents' ability to perform tasks that span the typical daily work of knowledge workers utilizing enterprise software systems. To this end, we propose WorkArena, a remote-hosted benchmark of 33 tasks based on the widely-used ServiceNow platform. We also introduce BrowserGym, an environment for the design and evaluation of such agents, offering a rich set of actions as well as multimodal observations. Our empirical evaluation reveals that while curren"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"while current agents show promise on WorkArena, there remains a considerable gap towards achieving full task automation. Notably, our analysis uncovers a significant performance disparity between open and closed-source LLMs","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The 33 tasks chosen for WorkArena are representative of the typical daily work of knowledge workers utilizing enterprise software systems.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"WorkArena benchmark shows LLM web agents achieve partial success on enterprise tasks but have a substantial gap to full automation and perform worse with open-source models.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Web agents based on large language models show some success on enterprise tasks but leave a large gap to full automation","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"99eae5455327ca52df7e426517b3caea16eaa209b0b3b877ef246bfc8ca5c8bc"},"source":{"id":"2403.07718","kind":"arxiv","version":5},"verdict":{"id":"59c8eea2-8330-41f8-b393-4a1b9036dca7","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T02:44:17.853901Z","strongest_claim":"while current agents show promise on WorkArena, there remains a considerable gap towards achieving full task automation. Notably, our analysis uncovers a significant performance disparity between open and closed-source LLMs","one_line_summary":"WorkArena benchmark shows LLM web agents achieve partial success on enterprise tasks but have a substantial gap to full automation and perform worse with open-source models.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The 33 tasks chosen for WorkArena are representative of the typical daily work of knowledge workers utilizing enterprise software systems.","pith_extraction_headline":"Web agents based on large language models show some success on enterprise tasks but leave a large gap to full automation"},"references":{"count":36,"sample":[{"doi":"","year":2023,"title":"The unsolved challenges of LLM s in open-ended web tasks: A case study","work_id":"5ff9f205-dfd5-47b6-bf24-94b6b13e38d2","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2016,"title":"Brockman, G., Cheung, V., Pettersson, L., Schneider, J., Schulman, J., Tang, J., and Zaremba, W. OpenAI gym, 2016","work_id":"83339e82-95f2-40c8-8327-942e29e38e35","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"Mind2Web: Towards a Generalist Agent for the Web","work_id":"e26f5a00-c007-439d-83f6-7900f5687b6b","ref_index":3,"cited_arxiv_id":"2306.06070","is_internal_anchor":true},{"doi":"","year":2023,"title":"Multimodal web navigation with instruction-finetuned foundation models","work_id":"0f8b8630-9215-4cb8-9b7d-e58e6b1f7bbb","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"Chrome devtools protocol, 2023","work_id":"2054d990-9253-491b-b01d-1f23d15eec34","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":36,"snapshot_sha256":"236e2f21fc6936dfb7b0b7ac3095c0be380ac0a3595b37d93338e84eda289677","internal_anchors":12},"formal_canon":{"evidence_count":1,"snapshot_sha256":"3f85100ae17c2c1ab72b396de0adc1a89c6521682565e5b41862715d03fd9db0"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"59c8eea2-8330-41f8-b393-4a1b9036dca7"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:53Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"hNQFOZUr0AlQfh0gOch4nbH0O/vglfOG4Dr5LhZ4pzN5aEwR4Bg4X8GHgLUhRw6727z7YeeZzcDraACG7vLYBA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-05T09:27:15.460643Z"},"content_sha256":"acfd0da439e40d2b435192ad541c14fd31e6956c3e87b88cc102fecdd9e215da","schema_version":"1.0","event_id":"sha256:acfd0da439e40d2b435192ad541c14fd31e6956c3e87b88cc102fecdd9e215da"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/B6N5RPC67O33FJY2I5FPMYLZZO/bundle.json","state_url":"https://pith.science/pith/B6N5RPC67O33FJY2I5FPMYLZZO/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/B6N5RPC67O33FJY2I5FPMYLZZO/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-05T09:27:15Z","links":{"resolver":"https://pith.science/pith/B6N5RPC67O33FJY2I5FPMYLZZO","bundle":"https://pith.science/pith/B6N5RPC67O33FJY2I5FPMYLZZO/bundle.json","state":"https://pith.science/pith/B6N5RPC67O33FJY2I5FPMYLZZO/state.json","well_known_bundle":"https://pith.science/.well-known/pith/B6N5RPC67O33FJY2I5FPMYLZZO/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2024:B6N5RPC67O33FJY2I5FPMYLZZO","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"241cc0cc95b853603ea2fb29976c470fc5f752468f33e0ea0bfdf7a31e2cb398","cross_cats_sorted":["cs.AI"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2024-03-12T14:58:45Z","title_canon_sha256":"6ac53eeabc9ba4a7957514da4595c3bd216575a61e7de3fd99f2fd3b9d5a0af2"},"schema_version":"1.0","source":{"id":"2403.07718","kind":"arxiv","version":5}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2403.07718","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"arxiv_version","alias_value":"2403.07718v5","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2403.07718","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"pith_short_12","alias_value":"B6N5RPC67O33","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"B6N5RPC67O33FJY2","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"B6N5RPC6","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:acfd0da439e40d2b435192ad541c14fd31e6956c3e87b88cc102fecdd9e215da","target":"graph","created_at":"2026-05-17T23:38:53Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"while current agents show promise on WorkArena, there remains a considerable gap towards achieving full task automation. Notably, our analysis uncovers a significant performance disparity between open and closed-source LLMs"},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"The 33 tasks chosen for WorkArena are representative of the typical daily work of knowledge workers utilizing enterprise software systems."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"WorkArena benchmark shows LLM web agents achieve partial success on enterprise tasks but have a substantial gap to full automation and perform worse with open-source models."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Web agents based on large language models show some success on enterprise tasks but leave a large gap to full automation"}],"snapshot_sha256":"99eae5455327ca52df7e426517b3caea16eaa209b0b3b877ef246bfc8ca5c8bc"},"formal_canon":{"evidence_count":1,"snapshot_sha256":"3f85100ae17c2c1ab72b396de0adc1a89c6521682565e5b41862715d03fd9db0"},"paper":{"abstract_excerpt":"We study the use of large language model-based agents for interacting with software via web browsers. Unlike prior work, we focus on measuring the agents' ability to perform tasks that span the typical daily work of knowledge workers utilizing enterprise software systems. To this end, we propose WorkArena, a remote-hosted benchmark of 33 tasks based on the widely-used ServiceNow platform. We also introduce BrowserGym, an environment for the design and evaluation of such agents, offering a rich set of actions as well as multimodal observations. Our empirical evaluation reveals that while curren","authors_text":"Alexandre Drouin, Alexandre Lacoste, David Vazquez, Issam H. Laradji, L\\'eo Boisvert, Manuel Del Verme, Massimo Caccia, Maxime Gasse, Megh Thakkar, Nicolas Chapados, Quentin Cappart, Tom Marty","cross_cats":["cs.AI"],"headline":"Web agents based on large language models show some success on enterprise tasks but leave a large gap to full automation","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2024-03-12T14:58:45Z","title":"WorkArena: How Capable Are Web Agents at Solving Common Knowledge Work Tasks?"},"references":{"count":36,"internal_anchors":12,"resolved_work":36,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"The unsolved challenges of LLM s in open-ended web tasks: A case study","work_id":"5ff9f205-dfd5-47b6-bf24-94b6b13e38d2","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Brockman, G., Cheung, V., Pettersson, L., Schneider, J., Schulman, J., Tang, J., and Zaremba, W. OpenAI gym, 2016","work_id":"83339e82-95f2-40c8-8327-942e29e38e35","year":2016},{"cited_arxiv_id":"2306.06070","doi":"","is_internal_anchor":true,"ref_index":3,"title":"Mind2Web: Towards a Generalist Agent for the Web","work_id":"e26f5a00-c007-439d-83f6-7900f5687b6b","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Multimodal web navigation with instruction-finetuned foundation models","work_id":"0f8b8630-9215-4cb8-9b7d-e58e6b1f7bbb","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Chrome devtools protocol, 2023","work_id":"2054d990-9253-491b-b01d-1f23d15eec34","year":2023}],"snapshot_sha256":"236e2f21fc6936dfb7b0b7ac3095c0be380ac0a3595b37d93338e84eda289677"},"source":{"id":"2403.07718","kind":"arxiv","version":5},"verdict":{"created_at":"2026-05-15T02:44:17.853901Z","id":"59c8eea2-8330-41f8-b393-4a1b9036dca7","model_set":{"reader":"grok-4.3"},"one_line_summary":"WorkArena benchmark shows LLM web agents achieve partial success on enterprise tasks but have a substantial gap to full automation and perform worse with open-source models.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Web agents based on large language models show some success on enterprise tasks but leave a large gap to full automation","strongest_claim":"while current agents show promise on WorkArena, there remains a considerable gap towards achieving full task automation. Notably, our analysis uncovers a significant performance disparity between open and closed-source LLMs","weakest_assumption":"The 33 tasks chosen for WorkArena are representative of the typical daily work of knowledge workers utilizing enterprise software systems."}},"verdict_id":"59c8eea2-8330-41f8-b393-4a1b9036dca7"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:46640c8eb329f311afa8f9b695510b4f353304552f56642dc3a5a1da6cba6ac8","target":"record","created_at":"2026-05-17T23:38:53Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"241cc0cc95b853603ea2fb29976c470fc5f752468f33e0ea0bfdf7a31e2cb398","cross_cats_sorted":["cs.AI"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2024-03-12T14:58:45Z","title_canon_sha256":"6ac53eeabc9ba4a7957514da4595c3bd216575a61e7de3fd99f2fd3b9d5a0af2"},"schema_version":"1.0","source":{"id":"2403.07718","kind":"arxiv","version":5}},"canonical_sha256":"0f9bd8bc5efbb7b2a71a474af66179cb8b53111d2184deeaedb5e532799e08ad","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"0f9bd8bc5efbb7b2a71a474af66179cb8b53111d2184deeaedb5e532799e08ad","first_computed_at":"2026-05-17T23:38:53.769379Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:53.769379Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"+ixWyl9YBe7Eg3koRFjX8ZiY7UGxG+rfZV8eogzqAjAEt5PORpnnA+to4h2IUGCnqggNIn9Mr4gnPh3y09zQCA==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:53.769963Z","signed_message":"canonical_sha256_bytes"},"source_id":"2403.07718","source_kind":"arxiv","source_version":5}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:46640c8eb329f311afa8f9b695510b4f353304552f56642dc3a5a1da6cba6ac8","sha256:acfd0da439e40d2b435192ad541c14fd31e6956c3e87b88cc102fecdd9e215da"],"state_sha256":"83430c8547acac4c2a0241dbbbecfd1e7ef43bc89251e8c365f56d351279142c"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"xF8kvXOx6szNWaFn4Pa739Myd5zOj8RjwsacJxiXiNERqqQYsyqVWkJCPBqNZJzc7TBoqehD8khH3C7JXrzlCg==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-05T09:27:15.463098Z","bundle_sha256":"60ccb8403a166a02953875c9a1726a47e4019d58063d804cca87a6d8bef622f6"}}