{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:22VLDJHSQ4NTENGNN47KGKPKOT","short_pith_number":"pith:22VLDJHS","schema_version":"1.0","canonical_sha256":"d6aab1a4f2871b3234cd6f3ea329ea74d00e8469d963933930fedef48e27e123","source":{"kind":"arxiv","id":"2504.00906","version":1},"attestation_state":"computed","paper":{"title":"Agent S2: A Compositional Generalist-Specialist Framework for Computer Use Agents","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.CL","cs.CV","cs.LG"],"primary_cat":"cs.AI","authors_text":"Ang Li, Jiachen Yang, Kyle Wong, Saaket Agashe, Vincent Tu, Xin Eric Wang","submitted_at":"2025-04-01T15:40:27Z","abstract_excerpt":"Computer use agents automate digital tasks by directly interacting with graphical user interfaces (GUIs) on computers and mobile devices, offering significant potential to enhance human productivity by completing an open-ended space of user queries. However, current agents face significant challenges: imprecise grounding of GUI elements, difficulties with long-horizon task planning, and performance bottlenecks from relying on single generalist models for diverse cognitive tasks. To this end, we introduce Agent S2, a novel compositional framework that delegates cognitive responsibilities across"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2504.00906","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2025-04-01T15:40:27Z","cross_cats_sorted":["cs.CL","cs.CV","cs.LG"],"title_canon_sha256":"b50779b0236733b278834f6e3f22edfee49582a0da9cee22d376e175804d67c5","abstract_canon_sha256":"ca91af6284027333c7efb511fe06d9e17078018ce1c14ade8db33ae0293772ff"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-22T15:33:55.609715Z","signature_b64":"jmJcZeYz0FusfmstgrdKPYJwH/A3OtxdDpglj7C8rRX5lLbFLUN/SGJckLnfJnNjhR8TocQMIcB3/Y+1cQyoAQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"d6aab1a4f2871b3234cd6f3ea329ea74d00e8469d963933930fedef48e27e123","last_reissued_at":"2026-05-22T15:33:55.606740Z","signature_status":"signed_v1","first_computed_at":"2026-05-22T15:33:55.606740Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Agent S2: A Compositional Generalist-Specialist Framework for Computer Use Agents","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.CL","cs.CV","cs.LG"],"primary_cat":"cs.AI","authors_text":"Ang Li, Jiachen Yang, Kyle Wong, Saaket Agashe, Vincent Tu, Xin Eric Wang","submitted_at":"2025-04-01T15:40:27Z","abstract_excerpt":"Computer use agents automate digital tasks by directly interacting with graphical user interfaces (GUIs) on computers and mobile devices, offering significant potential to enhance human productivity by completing an open-ended space of user queries. However, current agents face significant challenges: imprecise grounding of GUI elements, difficulties with long-horizon task planning, and performance bottlenecks from relying on single generalist models for diverse cognitive tasks. To this end, we introduce Agent S2, a novel compositional framework that delegates cognitive responsibilities across"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2504.00906","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2504.00906/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2504.00906","created_at":"2026-05-22T15:33:55.606876+00:00"},{"alias_kind":"arxiv_version","alias_value":"2504.00906v1","created_at":"2026-05-22T15:33:55.606876+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2504.00906","created_at":"2026-05-22T15:33:55.606876+00:00"},{"alias_kind":"pith_short_12","alias_value":"22VLDJHSQ4NT","created_at":"2026-05-22T15:33:55.606876+00:00"},{"alias_kind":"pith_short_16","alias_value":"22VLDJHSQ4NTENGN","created_at":"2026-05-22T15:33:55.606876+00:00"},{"alias_kind":"pith_short_8","alias_value":"22VLDJHS","created_at":"2026-05-22T15:33:55.606876+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":22,"internal_anchor_count":22,"sample":[{"citing_arxiv_id":"2505.03364","citing_title":"DroidRetriever: A Transparent and Steerable Automation System for Collaborative Mobile Information Seeking","ref_index":2,"is_internal_anchor":true},{"citing_arxiv_id":"2505.10887","citing_title":"InfantAgent-Next: A Multimodal Generalist Agent for Automated Computer Interaction","ref_index":3,"is_internal_anchor":true},{"citing_arxiv_id":"2506.16042","citing_title":"OSWorld-Human: Benchmarking the Efficiency of Computer-Use Agents","ref_index":1,"is_internal_anchor":true},{"citing_arxiv_id":"2605.18652","citing_title":"MementoGUI: Learning Agentic Multimodal Memory Control for Long-Horizon GUI Agents","ref_index":1,"is_internal_anchor":true},{"citing_arxiv_id":"2605.19769","citing_title":"OpenComputer: Verifiable Software Worlds for Computer-Use Agents","ref_index":1,"is_internal_anchor":true},{"citing_arxiv_id":"2509.07553","citing_title":"VeriOS: Query-Driven Proactive Human-Agent-GUI Interaction for Trustworthy OS Agents","ref_index":1,"is_internal_anchor":true},{"citing_arxiv_id":"2504.14239","citing_title":"InfiGUI-R1: Advancing Multimodal GUI Agents from Reactive Actors to Deliberative Reasoners","ref_index":2,"is_internal_anchor":true},{"citing_arxiv_id":"2510.24168","citing_title":"MGA: Memory-Driven GUI Agent for Observation-Centric Interaction","ref_index":4,"is_internal_anchor":true},{"citing_arxiv_id":"2507.05791","citing_title":"GTA1: GUI Test-time Scaling Agent","ref_index":8,"is_internal_anchor":true},{"citing_arxiv_id":"2512.10371","citing_title":"AgentProg: Empowering Long-Horizon GUI Agents with Program-Guided Context Management","ref_index":1,"is_internal_anchor":true},{"citing_arxiv_id":"2512.12634","citing_title":"MobiBench: Multi-Branch, Modular Benchmark for Mobile GUI Agents","ref_index":1,"is_internal_anchor":true},{"citing_arxiv_id":"2602.12430","citing_title":"Agent Skills for Large Language Models: Architecture, Acquisition, Security, and the Path Forward","ref_index":24,"is_internal_anchor":true},{"citing_arxiv_id":"2605.12501","citing_title":"Covering Human Action Space for Computer Use: Data Synthesis and Benchmark","ref_index":5,"is_internal_anchor":true},{"citing_arxiv_id":"2605.12481","citing_title":"ToolCUA: Towards Optimal GUI-Tool Path Orchestration for Computer Use Agents","ref_index":1,"is_internal_anchor":true},{"citing_arxiv_id":"2604.27955","citing_title":"GUI Agents with Reinforcement Learning: Toward Digital Inhabitants","ref_index":4,"is_internal_anchor":true},{"citing_arxiv_id":"2604.24348","citing_title":"OS-SPEAR: A Toolkit for the Safety, Performance,Efficiency, and Robustness Analysis of OS Agents","ref_index":56,"is_internal_anchor":true},{"citing_arxiv_id":"2604.13019","citing_title":"See, Point, Refine: Multi-Turn Approach to GUI Grounding with Visual Feedback","ref_index":1,"is_internal_anchor":true},{"citing_arxiv_id":"2605.07505","citing_title":"LiteGUI: Distilling Compact GUI Agents with Reinforcement Learning","ref_index":6,"is_internal_anchor":true},{"citing_arxiv_id":"2604.05157","citing_title":"IntentScore: Intent-Conditioned Action Evaluation for Computer-Use Agents","ref_index":2,"is_internal_anchor":true},{"citing_arxiv_id":"2604.14113","citing_title":"UI-Zoomer: Uncertainty-Driven Adaptive Zoom-In for GUI Grounding","ref_index":2,"is_internal_anchor":true},{"citing_arxiv_id":"2604.16007","citing_title":"MemExplorer: Navigating the Heterogeneous Memory Design Space for Agentic Inference NPUs","ref_index":3,"is_internal_anchor":true},{"citing_arxiv_id":"2604.21375","citing_title":"VLAA-GUI: Knowing When to Stop, Recover, and Search, A Modular Framework for GUI Automation","ref_index":2,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/22VLDJHSQ4NTENGNN47KGKPKOT","json":"https://pith.science/pith/22VLDJHSQ4NTENGNN47KGKPKOT.json","graph_json":"https://pith.science/api/pith-number/22VLDJHSQ4NTENGNN47KGKPKOT/graph.json","events_json":"https://pith.science/api/pith-number/22VLDJHSQ4NTENGNN47KGKPKOT/events.json","paper":"https://pith.science/paper/22VLDJHS"},"agent_actions":{"view_html":"https://pith.science/pith/22VLDJHSQ4NTENGNN47KGKPKOT","download_json":"https://pith.science/pith/22VLDJHSQ4NTENGNN47KGKPKOT.json","view_paper":"https://pith.science/paper/22VLDJHS","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2504.00906&json=true","fetch_graph":"https://pith.science/api/pith-number/22VLDJHSQ4NTENGNN47KGKPKOT/graph.json","fetch_events":"https://pith.science/api/pith-number/22VLDJHSQ4NTENGNN47KGKPKOT/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/22VLDJHSQ4NTENGNN47KGKPKOT/action/timestamp_anchor","attest_storage":"https://pith.science/pith/22VLDJHSQ4NTENGNN47KGKPKOT/action/storage_attestation","attest_author":"https://pith.science/pith/22VLDJHSQ4NTENGNN47KGKPKOT/action/author_attestation","sign_citation":"https://pith.science/pith/22VLDJHSQ4NTENGNN47KGKPKOT/action/citation_signature","submit_replication":"https://pith.science/pith/22VLDJHSQ4NTENGNN47KGKPKOT/action/replication_record"}},"created_at":"2026-05-22T15:33:55.606876+00:00","updated_at":"2026-05-22T15:33:55.606876+00:00"}