{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2024:RSD5FDDU5HITB7RDVZUEKBECT2","short_pith_number":"pith:RSD5FDDU","schema_version":"1.0","canonical_sha256":"8c87d28c74e9d130fe23ae684504829e99c881c8f6b46e564fd69c0778e2e2a7","source":{"kind":"arxiv","id":"2406.12373","version":3},"attestation_state":"computed","paper":{"title":"WebCanvas: Benchmarking Web Agents in Online Environments","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI","cs.LG"],"primary_cat":"cs.CL","authors_text":"Bing Jiang, Cheng Cui, Dehan Kong, Hangyu Liu, Shuyan Zhou, Sida Zhou, Tongshuang Wu, Yanyi Shang, Yichen Pan, Yifei Leng, Zhengyang Wu","submitted_at":"2024-06-18T07:58:33Z","abstract_excerpt":"For web agents to be practically useful, they must adapt to the continuously evolving web environment characterized by frequent updates to user interfaces and content. However, most existing benchmarks only capture the static aspects of the web. To bridge this gap, we introduce WebCanvas, an innovative online evaluation framework for web agents that effectively addresses the dynamic nature of web interactions. WebCanvas contains three main components to facilitate realistic assessments: (1) A novel evaluation metric which reliably capture critical intermediate actions or states necessary for t"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2406.12373","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2024-06-18T07:58:33Z","cross_cats_sorted":["cs.AI","cs.LG"],"title_canon_sha256":"a08c8d164bed182dbe42b68826b418c245abac6234ec1bdd4b3a065a2fa214b9","abstract_canon_sha256":"d9743c3cc6b9480d307ff37f77945dee3724a0cb4f27e278e62ff7460d7f5719"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T11:23:51.087480Z","signature_b64":"dXYf/hFbRoAbQGYMacqduVRRPxamlZkX/xAWXlFcldVKzGepqsu8htgiyCBCHo4YTI+Ex54JxrW2tUtxSUH1CA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"8c87d28c74e9d130fe23ae684504829e99c881c8f6b46e564fd69c0778e2e2a7","last_reissued_at":"2026-05-20T11:23:51.085608Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T11:23:51.085608Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"WebCanvas: Benchmarking Web Agents in Online Environments","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI","cs.LG"],"primary_cat":"cs.CL","authors_text":"Bing Jiang, Cheng Cui, Dehan Kong, Hangyu Liu, Shuyan Zhou, Sida Zhou, Tongshuang Wu, Yanyi Shang, Yichen Pan, Yifei Leng, Zhengyang Wu","submitted_at":"2024-06-18T07:58:33Z","abstract_excerpt":"For web agents to be practically useful, they must adapt to the continuously evolving web environment characterized by frequent updates to user interfaces and content. However, most existing benchmarks only capture the static aspects of the web. To bridge this gap, we introduce WebCanvas, an innovative online evaluation framework for web agents that effectively addresses the dynamic nature of web interactions. WebCanvas contains three main components to facilitate realistic assessments: (1) A novel evaluation metric which reliably capture critical intermediate actions or states necessary for t"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2406.12373","kind":"arxiv","version":3},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2406.12373/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2406.12373","created_at":"2026-05-20T11:23:51.085686+00:00"},{"alias_kind":"arxiv_version","alias_value":"2406.12373v3","created_at":"2026-05-20T11:23:51.085686+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2406.12373","created_at":"2026-05-20T11:23:51.085686+00:00"},{"alias_kind":"pith_short_12","alias_value":"RSD5FDDU5HIT","created_at":"2026-05-20T11:23:51.085686+00:00"},{"alias_kind":"pith_short_16","alias_value":"RSD5FDDU5HITB7RD","created_at":"2026-05-20T11:23:51.085686+00:00"},{"alias_kind":"pith_short_8","alias_value":"RSD5FDDU","created_at":"2026-05-20T11:23:51.085686+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":22,"internal_anchor_count":22,"sample":[{"citing_arxiv_id":"2604.27245","citing_title":"Addressing the Reality Gap: A Three-Tension Framework for Agentic AI Adoption","ref_index":45,"is_internal_anchor":true},{"citing_arxiv_id":"2605.16402","citing_title":"WinDeskGround: A Benchmark for Robust GUI Grounding in Complex Multi-Window Desktop Environments","ref_index":12,"is_internal_anchor":true},{"citing_arxiv_id":"2605.18048","citing_title":"DocOS: Towards Proactive Document-Guided Actions in GUI Agents","ref_index":74,"is_internal_anchor":true},{"citing_arxiv_id":"2605.19219","citing_title":"SimGym: A Framework for A/B Test Simulation in E-Commerce with Traffic-Grounded VLM Agents","ref_index":12,"is_internal_anchor":true},{"citing_arxiv_id":"2506.03610","citing_title":"Orak: A Foundational Benchmark for Training and Evaluating LLM Agents on Diverse Video Games","ref_index":6,"is_internal_anchor":true},{"citing_arxiv_id":"2508.15832","citing_title":"A Functionality-Grounded Benchmark for Evaluating Web Agents in E-commerce Domains","ref_index":17,"is_internal_anchor":true},{"citing_arxiv_id":"2509.14528","citing_title":"Why Johnny Can't Use Agents: Industry Aspirations vs. User Realities with AI Agents","ref_index":62,"is_internal_anchor":true},{"citing_arxiv_id":"2412.04454","citing_title":"Aguvis: Unified Pure Vision Agents for Autonomous GUI Interaction","ref_index":102,"is_internal_anchor":true},{"citing_arxiv_id":"2603.05044","citing_title":"WebFactory: Automated Compression of Foundational Language Intelligence into Grounded Web Agents","ref_index":21,"is_internal_anchor":true},{"citing_arxiv_id":"2507.21046","citing_title":"A Survey of Self-Evolving Agents: What, When, How, and Where to Evolve on the Path to Artificial Super Intelligence","ref_index":151,"is_internal_anchor":true},{"citing_arxiv_id":"2604.27955","citing_title":"GUI Agents with Reinforcement Learning: Toward Digital Inhabitants","ref_index":50,"is_internal_anchor":true},{"citing_arxiv_id":"2604.28139","citing_title":"Claw-Eval-Live: A Live Agent Benchmark for Evolving Real-World Workflows","ref_index":30,"is_internal_anchor":true},{"citing_arxiv_id":"2605.10834","citing_title":"From Controlled to the Wild: Evaluation of Pentesting Agents for the Real-World","ref_index":24,"is_internal_anchor":true},{"citing_arxiv_id":"2605.08545","citing_title":"Log analysis is necessary for credible evaluation of AI agents","ref_index":1,"is_internal_anchor":true},{"citing_arxiv_id":"2604.18543","citing_title":"ClawEnvKit: Automatic Environment Generation for Claw-Like Agents","ref_index":38,"is_internal_anchor":true},{"citing_arxiv_id":"2604.10988","citing_title":"WebForge: Breaking the Realism-Reproducibility-Scalability Trilemma in Browser Agent Benchmark","ref_index":24,"is_internal_anchor":true},{"citing_arxiv_id":"2604.08523","citing_title":"ClawBench: Can AI Agents Complete Everyday Online Tasks?","ref_index":7,"is_internal_anchor":true},{"citing_arxiv_id":"2604.08516","citing_title":"MolmoWeb: Open Visual Web Agent and Open Data for the Open Web","ref_index":71,"is_internal_anchor":true},{"citing_arxiv_id":"2604.13531","citing_title":"RiskWebWorld: A Realistic Interactive Benchmark for GUI Agents in E-commerce Risk Management","ref_index":36,"is_internal_anchor":true},{"citing_arxiv_id":"2604.17091","citing_title":"GenericAgent: A Token-Efficient Self-Evolving LLM Agent via Contextual Information Density Maximization (V1.0)","ref_index":26,"is_internal_anchor":true},{"citing_arxiv_id":"2605.02729","citing_title":"Augmenting Interface Usability Heuristics for Reliable Computer-Use Agents","ref_index":8,"is_internal_anchor":true},{"citing_arxiv_id":"2604.27245","citing_title":"Addressing the Reality Gap: A Three-Tension Framework for Agentic AI Adoption","ref_index":45,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/RSD5FDDU5HITB7RDVZUEKBECT2","json":"https://pith.science/pith/RSD5FDDU5HITB7RDVZUEKBECT2.json","graph_json":"https://pith.science/api/pith-number/RSD5FDDU5HITB7RDVZUEKBECT2/graph.json","events_json":"https://pith.science/api/pith-number/RSD5FDDU5HITB7RDVZUEKBECT2/events.json","paper":"https://pith.science/paper/RSD5FDDU"},"agent_actions":{"view_html":"https://pith.science/pith/RSD5FDDU5HITB7RDVZUEKBECT2","download_json":"https://pith.science/pith/RSD5FDDU5HITB7RDVZUEKBECT2.json","view_paper":"https://pith.science/paper/RSD5FDDU","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2406.12373&json=true","fetch_graph":"https://pith.science/api/pith-number/RSD5FDDU5HITB7RDVZUEKBECT2/graph.json","fetch_events":"https://pith.science/api/pith-number/RSD5FDDU5HITB7RDVZUEKBECT2/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/RSD5FDDU5HITB7RDVZUEKBECT2/action/timestamp_anchor","attest_storage":"https://pith.science/pith/RSD5FDDU5HITB7RDVZUEKBECT2/action/storage_attestation","attest_author":"https://pith.science/pith/RSD5FDDU5HITB7RDVZUEKBECT2/action/author_attestation","sign_citation":"https://pith.science/pith/RSD5FDDU5HITB7RDVZUEKBECT2/action/citation_signature","submit_replication":"https://pith.science/pith/RSD5FDDU5HITB7RDVZUEKBECT2/action/replication_record"}},"created_at":"2026-05-20T11:23:51.085686+00:00","updated_at":"2026-05-20T11:23:51.085686+00:00"}