{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:YOB6WEPEIDTSJQM7J245AF4U5Y","short_pith_number":"pith:YOB6WEPE","schema_version":"1.0","canonical_sha256":"c383eb11e440e724c19f4eb9d01794ee30501701125e2358f96777dd409447f8","source":{"kind":"arxiv","id":"2504.14239","version":1},"attestation_state":"computed","paper":{"title":"InfiGUI-R1: Advancing Multimodal GUI Agents from Reactive Actors to Deliberative Reasoners","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.CL"],"primary_cat":"cs.AI","authors_text":"Congkai Xie, Fei Wu, Hongxia Yang, Pengxiang Li, Shengyu Zhang, Xavier Hu, Xiaotian Han, Yuhang Liu","submitted_at":"2025-04-19T09:25:55Z","abstract_excerpt":"Multimodal Large Language Models (MLLMs) have powered Graphical User Interface (GUI) Agents, showing promise in automating tasks on computing devices. Recent works have begun exploring reasoning in GUI tasks with encouraging results. However, many current approaches rely on manually designed reasoning templates, which may result in reasoning that is not sufficiently robust and adaptive for complex GUI environments. Meanwhile, some existing agents continue to operate as Reactive Actors, relying primarily on implicit reasoning that may lack sufficient depth for GUI tasks demanding planning and e"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2504.14239","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2025-04-19T09:25:55Z","cross_cats_sorted":["cs.CL"],"title_canon_sha256":"d4983acf413c1a53b678216cb05e0d55080481671aa5d7ff0b1bfafbd2e62e4f","abstract_canon_sha256":"2aa09bdae55e96c9295f919d5492070f4bd93d2478c009b1433a0e41334222c3"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T13:48:13.235560Z","signature_b64":"sG8g3RMFtAyu9YsiOt15VJ+U5R3tnETl8nl9AxRUzx8JjAFM6BnLrtAIOlcb1NrGaj7e7NQmuI9vosKMZKpNBw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"c383eb11e440e724c19f4eb9d01794ee30501701125e2358f96777dd409447f8","last_reissued_at":"2026-05-18T13:48:13.232891Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T13:48:13.232891Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"InfiGUI-R1: Advancing Multimodal GUI Agents from Reactive Actors to Deliberative Reasoners","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.CL"],"primary_cat":"cs.AI","authors_text":"Congkai Xie, Fei Wu, Hongxia Yang, Pengxiang Li, Shengyu Zhang, Xavier Hu, Xiaotian Han, Yuhang Liu","submitted_at":"2025-04-19T09:25:55Z","abstract_excerpt":"Multimodal Large Language Models (MLLMs) have powered Graphical User Interface (GUI) Agents, showing promise in automating tasks on computing devices. Recent works have begun exploring reasoning in GUI tasks with encouraging results. However, many current approaches rely on manually designed reasoning templates, which may result in reasoning that is not sufficiently robust and adaptive for complex GUI environments. Meanwhile, some existing agents continue to operate as Reactive Actors, relying primarily on implicit reasoning that may lack sufficient depth for GUI tasks demanding planning and e"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2504.14239","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2504.14239","created_at":"2026-05-18T13:48:13.233016+00:00"},{"alias_kind":"arxiv_version","alias_value":"2504.14239v1","created_at":"2026-05-18T13:48:13.233016+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2504.14239","created_at":"2026-05-18T13:48:13.233016+00:00"},{"alias_kind":"pith_short_12","alias_value":"YOB6WEPEIDTS","created_at":"2026-05-18T13:48:13.233016+00:00"},{"alias_kind":"pith_short_16","alias_value":"YOB6WEPEIDTSJQM7","created_at":"2026-05-18T13:48:13.233016+00:00"},{"alias_kind":"pith_short_8","alias_value":"YOB6WEPE","created_at":"2026-05-18T13:48:13.233016+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":23,"internal_anchor_count":23,"sample":[{"citing_arxiv_id":"2506.20332","citing_title":"Mobile-R1: Towards Interactive Capability for VLM-Based Mobile Agent via Systematic Training","ref_index":8,"is_internal_anchor":true},{"citing_arxiv_id":"2508.19679","citing_title":"InquireMobile: Teaching VLM-based Mobile Agent to Request Human Assistance via Reinforcement Fine-Tuning","ref_index":13,"is_internal_anchor":true},{"citing_arxiv_id":"2509.07553","citing_title":"VeriOS: Query-Driven Proactive Human-Agent-GUI Interaction for Trustworthy OS Agents","ref_index":28,"is_internal_anchor":true},{"citing_arxiv_id":"2509.21816","citing_title":"From Task to Tutorial: An Automated GUI Framework for Excel Tutorial Document and Video Creation","ref_index":10,"is_internal_anchor":true},{"citing_arxiv_id":"2509.21982","citing_title":"RISK: A Framework for GUI Agents in E-commerce Risk Management","ref_index":12,"is_internal_anchor":true},{"citing_arxiv_id":"2510.24168","citing_title":"MGA: Memory-Driven GUI Agent for Observation-Centric Interaction","ref_index":16,"is_internal_anchor":true},{"citing_arxiv_id":"2507.05791","citing_title":"GTA1: GUI Test-time Scaling Agent","ref_index":16,"is_internal_anchor":true},{"citing_arxiv_id":"2605.14311","citing_title":"Beyond Binary: Reframing GUI Critique as Continuous Semantic Alignment","ref_index":93,"is_internal_anchor":true},{"citing_arxiv_id":"2507.21046","citing_title":"A Survey of Self-Evolving Agents: What, When, How, and Where to Evolve on the Path to Artificial Super Intelligence","ref_index":246,"is_internal_anchor":true},{"citing_arxiv_id":"2605.12501","citing_title":"Covering Human Action Space for Computer Use: Data Synthesis and Benchmark","ref_index":18,"is_internal_anchor":true},{"citing_arxiv_id":"2604.27859","citing_title":"A Brief Overview: Agentic Reinforcement Learning In Large Language Models","ref_index":51,"is_internal_anchor":true},{"citing_arxiv_id":"2605.00642","citing_title":"Learn where to Click from Yourself: On-Policy Self-Distillation for GUI Grounding","ref_index":19,"is_internal_anchor":true},{"citing_arxiv_id":"2605.10347","citing_title":"How Mobile World Model Guides GUI Agents?","ref_index":35,"is_internal_anchor":true},{"citing_arxiv_id":"2604.27859","citing_title":"A Brief Overview: Agentic Reinforcement Learning In Large Language Models","ref_index":51,"is_internal_anchor":true},{"citing_arxiv_id":"2604.24348","citing_title":"OS-SPEAR: A Toolkit for the Safety, Performance,Efficiency, and Robustness Analysis of OS Agents","ref_index":28,"is_internal_anchor":true},{"citing_arxiv_id":"2605.06534","citing_title":"ROSE: Rollout On Serving GPUs via Cooperative Elasticity for Agentic RL","ref_index":39,"is_internal_anchor":true},{"citing_arxiv_id":"2604.22558","citing_title":"SOLAR-RL: Semi-Online Long-horizon Assignment Reinforcement Learning","ref_index":9,"is_internal_anchor":true},{"citing_arxiv_id":"2605.06664","citing_title":"BAMI: Training-Free Bias Mitigation in GUI Grounding","ref_index":16,"is_internal_anchor":true},{"citing_arxiv_id":"2605.00642","citing_title":"Learn where to Click from Yourself: On-Policy Self-Distillation for GUI Grounding","ref_index":19,"is_internal_anchor":true},{"citing_arxiv_id":"2604.13019","citing_title":"See, Point, Refine: Multi-Turn Approach to GUI Grounding with Visual Feedback","ref_index":10,"is_internal_anchor":true},{"citing_arxiv_id":"2605.07505","citing_title":"LiteGUI: Distilling Compact GUI Agents with Reinforcement Learning","ref_index":21,"is_internal_anchor":true},{"citing_arxiv_id":"2604.07831","citing_title":"Are GUI Agents Focused Enough? Automated Distraction via Semantic-level UI Element Injection","ref_index":32,"is_internal_anchor":true},{"citing_arxiv_id":"2605.02630","citing_title":"AutoFocus: Uncertainty-Aware Active Visual Search for GUI Grounding","ref_index":20,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/YOB6WEPEIDTSJQM7J245AF4U5Y","json":"https://pith.science/pith/YOB6WEPEIDTSJQM7J245AF4U5Y.json","graph_json":"https://pith.science/api/pith-number/YOB6WEPEIDTSJQM7J245AF4U5Y/graph.json","events_json":"https://pith.science/api/pith-number/YOB6WEPEIDTSJQM7J245AF4U5Y/events.json","paper":"https://pith.science/paper/YOB6WEPE"},"agent_actions":{"view_html":"https://pith.science/pith/YOB6WEPEIDTSJQM7J245AF4U5Y","download_json":"https://pith.science/pith/YOB6WEPEIDTSJQM7J245AF4U5Y.json","view_paper":"https://pith.science/paper/YOB6WEPE","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2504.14239&json=true","fetch_graph":"https://pith.science/api/pith-number/YOB6WEPEIDTSJQM7J245AF4U5Y/graph.json","fetch_events":"https://pith.science/api/pith-number/YOB6WEPEIDTSJQM7J245AF4U5Y/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/YOB6WEPEIDTSJQM7J245AF4U5Y/action/timestamp_anchor","attest_storage":"https://pith.science/pith/YOB6WEPEIDTSJQM7J245AF4U5Y/action/storage_attestation","attest_author":"https://pith.science/pith/YOB6WEPEIDTSJQM7J245AF4U5Y/action/author_attestation","sign_citation":"https://pith.science/pith/YOB6WEPEIDTSJQM7J245AF4U5Y/action/citation_signature","submit_replication":"https://pith.science/pith/YOB6WEPEIDTSJQM7J245AF4U5Y/action/replication_record"}},"created_at":"2026-05-18T13:48:13.233016+00:00","updated_at":"2026-05-18T13:48:13.233016+00:00"}