{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2024:7LNTAEUAU5EJVFRRZOFONBXRUQ","short_pith_number":"pith:7LNTAEUA","schema_version":"1.0","canonical_sha256":"fadb301280a7489a9631cb8ae686f1a405574087d30f18e3b98d1b87c43a46d7","source":{"kind":"arxiv","id":"2411.18279","version":12},"attestation_state":"computed","paper":{"title":"Large Language Model-Brained GUI Agents: A Survey","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.CL","cs.HC"],"primary_cat":"cs.AI","authors_text":"Bowen Li, Chaoyun Zhang, Dongmei Zhang, Guyue Liu, Jiaxu Qian, Liqun Li, Minghua Ma, Qingwei Lin, Qi Zhang, Saravan Rajmohan, Shilin He, Si Qin, Yu Kang","submitted_at":"2024-11-27T12:13:39Z","abstract_excerpt":"GUIs have long been central to human-computer interaction, providing an intuitive and visually-driven way to access and interact with digital systems. The advent of LLMs, particularly multimodal models, has ushered in a new era of GUI automation. They have demonstrated exceptional capabilities in natural language understanding, code generation, and visual processing. This has paved the way for a new generation of LLM-brained GUI agents capable of interpreting complex GUI elements and autonomously executing actions based on natural language instructions. These agents represent a paradigm shift,"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2411.18279","kind":"arxiv","version":12},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2024-11-27T12:13:39Z","cross_cats_sorted":["cs.CL","cs.HC"],"title_canon_sha256":"c0f436a39c81e89476c667884424f5e3e68ed6ca67a64cf426007cb0bbde752e","abstract_canon_sha256":"3447e7fb6deea3fe0ffc38af64ce9392c3600176c70b59627b0d41646d8c55c0"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-19T11:02:19.763222Z","signature_b64":"9PllyHkRYi7bWTGIYsnrjmweQDFLYuJXMSjq7hdlS/Fh0vUwvmKHD0fJDrfdyVCXCLWbsIRitgSRndL4mNRAAQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"fadb301280a7489a9631cb8ae686f1a405574087d30f18e3b98d1b87c43a46d7","last_reissued_at":"2026-05-19T11:02:19.761363Z","signature_status":"signed_v1","first_computed_at":"2026-05-19T11:02:19.761363Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Large Language Model-Brained GUI Agents: A Survey","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.CL","cs.HC"],"primary_cat":"cs.AI","authors_text":"Bowen Li, Chaoyun Zhang, Dongmei Zhang, Guyue Liu, Jiaxu Qian, Liqun Li, Minghua Ma, Qingwei Lin, Qi Zhang, Saravan Rajmohan, Shilin He, Si Qin, Yu Kang","submitted_at":"2024-11-27T12:13:39Z","abstract_excerpt":"GUIs have long been central to human-computer interaction, providing an intuitive and visually-driven way to access and interact with digital systems. The advent of LLMs, particularly multimodal models, has ushered in a new era of GUI automation. They have demonstrated exceptional capabilities in natural language understanding, code generation, and visual processing. This has paved the way for a new generation of LLM-brained GUI agents capable of interpreting complex GUI elements and autonomously executing actions based on natural language instructions. These agents represent a paradigm shift,"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2411.18279","kind":"arxiv","version":12},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2411.18279/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2411.18279","created_at":"2026-05-19T11:02:19.761443+00:00"},{"alias_kind":"arxiv_version","alias_value":"2411.18279v12","created_at":"2026-05-19T11:02:19.761443+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2411.18279","created_at":"2026-05-19T11:02:19.761443+00:00"},{"alias_kind":"pith_short_12","alias_value":"7LNTAEUAU5EJ","created_at":"2026-05-19T11:02:19.761443+00:00"},{"alias_kind":"pith_short_16","alias_value":"7LNTAEUAU5EJVFRR","created_at":"2026-05-19T11:02:19.761443+00:00"},{"alias_kind":"pith_short_8","alias_value":"7LNTAEUA","created_at":"2026-05-19T11:02:19.761443+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":23,"internal_anchor_count":23,"sample":[{"citing_arxiv_id":"2501.16150","citing_title":"A Comprehensive Survey of Agents for Computer Use: Foundations, Challenges, and Future Directions","ref_index":182,"is_internal_anchor":true},{"citing_arxiv_id":"2502.08691","citing_title":"AgentSociety: Large-Scale Simulation of LLM-Driven Generative Agents Advances Understanding of Human Behaviors and Society","ref_index":111,"is_internal_anchor":true},{"citing_arxiv_id":"2506.23978","citing_title":"LLM Agents Are the Antidote to Walled Gardens","ref_index":105,"is_internal_anchor":true},{"citing_arxiv_id":"2605.21082","citing_title":"AutoRPA: Efficient GUI Automation through LLM-Driven Code Synthesis from Interactions","ref_index":2,"is_internal_anchor":true},{"citing_arxiv_id":"2506.09373","citing_title":"LPO: Towards Accurate GUI Agent Interaction via Location Preference Optimization","ref_index":26,"is_internal_anchor":true},{"citing_arxiv_id":"2507.10610","citing_title":"LaSM: Layer-wise Scaling Mechanism for Defending Pop-up Attack on GUI Agents","ref_index":1,"is_internal_anchor":true},{"citing_arxiv_id":"2509.07553","citing_title":"VeriOS: Query-Driven Proactive Human-Agent-GUI Interaction for Trustworthy OS Agents","ref_index":68,"is_internal_anchor":true},{"citing_arxiv_id":"2509.14528","citing_title":"Why Johnny Can't Use Agents: Industry Aspirations vs. User Realities with AI Agents","ref_index":89,"is_internal_anchor":true},{"citing_arxiv_id":"2509.21816","citing_title":"From Task to Tutorial: An Automated GUI Framework for Excel Tutorial Document and Video Creation","ref_index":26,"is_internal_anchor":true},{"citing_arxiv_id":"2509.21982","citing_title":"RISK: A Framework for GUI Agents in E-commerce Risk Management","ref_index":27,"is_internal_anchor":true},{"citing_arxiv_id":"2510.24168","citing_title":"MGA: Memory-Driven GUI Agent for Observation-Centric Interaction","ref_index":43,"is_internal_anchor":true},{"citing_arxiv_id":"2503.09572","citing_title":"Plan-and-Act: Improving Planning of Agents for Long-Horizon Tasks","ref_index":59,"is_internal_anchor":true},{"citing_arxiv_id":"2504.19678","citing_title":"From LLM Reasoning to Autonomous AI Agents: A Comprehensive Review","ref_index":62,"is_internal_anchor":true},{"citing_arxiv_id":"2605.12549","citing_title":"What Happens Before Decoding? Prefill Determines GUI Grounding in VLMs","ref_index":2,"is_internal_anchor":true},{"citing_arxiv_id":"2604.27996","citing_title":"Exploring Interaction Paradigms for LLM Agents in Scientific Visualization","ref_index":36,"is_internal_anchor":true},{"citing_arxiv_id":"2604.03976","citing_title":"Quantifying Trust: Financial Risk Management for Trustworthy AI Agents","ref_index":43,"is_internal_anchor":true},{"citing_arxiv_id":"2509.02544","citing_title":"UI-TARS-2 Technical Report: Advancing GUI Agent with Multi-Turn Reinforcement Learning","ref_index":88,"is_internal_anchor":true},{"citing_arxiv_id":"2602.12430","citing_title":"Agent Skills for Large Language Models: Architecture, Acquisition, Security, and the Path Forward","ref_index":8,"is_internal_anchor":true},{"citing_arxiv_id":"2604.27996","citing_title":"Exploring Interaction Paradigms for LLM Agents in Scientific Visualization","ref_index":36,"is_internal_anchor":true},{"citing_arxiv_id":"2604.21268","citing_title":"Measure Twice, Click Once: Co-evolving Proposer and Visual Critic via Reinforcement Learning for GUI Grounding","ref_index":70,"is_internal_anchor":true},{"citing_arxiv_id":"2604.11259","citing_title":"Mobile GUI Agent Privacy Personalization with Trajectory Induced Preference Optimization","ref_index":40,"is_internal_anchor":true},{"citing_arxiv_id":"2604.06134","citing_title":"MAESTRO: Adapting GUIs and Guiding Navigation with User Preferences in Conversational Agents with GUIs","ref_index":54,"is_internal_anchor":true},{"citing_arxiv_id":"2604.21375","citing_title":"VLAA-GUI: Knowing When to Stop, Recover, and Search, A Modular Framework for GUI Automation","ref_index":79,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/7LNTAEUAU5EJVFRRZOFONBXRUQ","json":"https://pith.science/pith/7LNTAEUAU5EJVFRRZOFONBXRUQ.json","graph_json":"https://pith.science/api/pith-number/7LNTAEUAU5EJVFRRZOFONBXRUQ/graph.json","events_json":"https://pith.science/api/pith-number/7LNTAEUAU5EJVFRRZOFONBXRUQ/events.json","paper":"https://pith.science/paper/7LNTAEUA"},"agent_actions":{"view_html":"https://pith.science/pith/7LNTAEUAU5EJVFRRZOFONBXRUQ","download_json":"https://pith.science/pith/7LNTAEUAU5EJVFRRZOFONBXRUQ.json","view_paper":"https://pith.science/paper/7LNTAEUA","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2411.18279&json=true","fetch_graph":"https://pith.science/api/pith-number/7LNTAEUAU5EJVFRRZOFONBXRUQ/graph.json","fetch_events":"https://pith.science/api/pith-number/7LNTAEUAU5EJVFRRZOFONBXRUQ/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/7LNTAEUAU5EJVFRRZOFONBXRUQ/action/timestamp_anchor","attest_storage":"https://pith.science/pith/7LNTAEUAU5EJVFRRZOFONBXRUQ/action/storage_attestation","attest_author":"https://pith.science/pith/7LNTAEUAU5EJVFRRZOFONBXRUQ/action/author_attestation","sign_citation":"https://pith.science/pith/7LNTAEUAU5EJVFRRZOFONBXRUQ/action/citation_signature","submit_replication":"https://pith.science/pith/7LNTAEUAU5EJVFRRZOFONBXRUQ/action/replication_record"}},"created_at":"2026-05-19T11:02:19.761443+00:00","updated_at":"2026-05-19T11:02:19.761443+00:00"}