{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:XZCHXYM3RBA4CBE7H2FTHELL22","short_pith_number":"pith:XZCHXYM3","schema_version":"1.0","canonical_sha256":"be447be19b8841c1049f3e8b33916bd6834678ccbfdd7ce26aa98448b79b84a6","source":{"kind":"arxiv","id":"2605.18758","version":1},"attestation_state":"computed","paper":{"title":"OmniGUI: Benchmarking GUI Agents in Omni-Modal Smartphone Environments","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.HC","authors_text":"Bingqian Zhang, Felix Henry, Jiangyou Zhu, Min Chen, Shiyu Huang, Xiaochen Lin, Yangfan","submitted_at":"2026-04-03T08:57:06Z","abstract_excerpt":"Current benchmarks for graphical user interface (GUI) agents predominantly rely on static screenshots. However, real-world smartphone interaction routinely requires agents to process transient audio cues and temporal video dynamics that are tightly coupled with the moment of action. To bridge this gap, we introduce OmniGUI, the first step-level benchmark designed to evaluate GUI agents in omni-modal smartphone environments. OmniGUI provides continuous, interleaved multimodal inputs comprising static images, synchronous audio, and video clips at every action step. The dataset encompasses 709 ex"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2605.18758","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.HC","submitted_at":"2026-04-03T08:57:06Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"a1bd205e0fc570e53697b118d9334291c378ee6d2d05f6df342674c09ba31cdb","abstract_canon_sha256":"7546db70b0f3f8580c2e7de0a489dab83a0d3dfd83a4fe886a97a43eca50f4b7"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T00:06:20.368953Z","signature_b64":"T4o3qurLb3slGAYc8gS2vaywL3W111yaLJ+/p91n1exJRoz4fUWIQF5+TKHkugSAHu1gQ7KrIPTnu0VtU2xJCg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"be447be19b8841c1049f3e8b33916bd6834678ccbfdd7ce26aa98448b79b84a6","last_reissued_at":"2026-05-20T00:06:20.368061Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T00:06:20.368061Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"OmniGUI: Benchmarking GUI Agents in Omni-Modal Smartphone Environments","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.HC","authors_text":"Bingqian Zhang, Felix Henry, Jiangyou Zhu, Min Chen, Shiyu Huang, Xiaochen Lin, Yangfan","submitted_at":"2026-04-03T08:57:06Z","abstract_excerpt":"Current benchmarks for graphical user interface (GUI) agents predominantly rely on static screenshots. However, real-world smartphone interaction routinely requires agents to process transient audio cues and temporal video dynamics that are tightly coupled with the moment of action. To bridge this gap, we introduce OmniGUI, the first step-level benchmark designed to evaluate GUI agents in omni-modal smartphone environments. OmniGUI provides continuous, interleaved multimodal inputs comprising static images, synchronous audio, and video clips at every action step. The dataset encompasses 709 ex"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.18758","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.18758/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2605.18758","created_at":"2026-05-20T00:06:20.368190+00:00"},{"alias_kind":"arxiv_version","alias_value":"2605.18758v1","created_at":"2026-05-20T00:06:20.368190+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.18758","created_at":"2026-05-20T00:06:20.368190+00:00"},{"alias_kind":"pith_short_12","alias_value":"XZCHXYM3RBA4","created_at":"2026-05-20T00:06:20.368190+00:00"},{"alias_kind":"pith_short_16","alias_value":"XZCHXYM3RBA4CBE7","created_at":"2026-05-20T00:06:20.368190+00:00"},{"alias_kind":"pith_short_8","alias_value":"XZCHXYM3","created_at":"2026-05-20T00:06:20.368190+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/XZCHXYM3RBA4CBE7H2FTHELL22","json":"https://pith.science/pith/XZCHXYM3RBA4CBE7H2FTHELL22.json","graph_json":"https://pith.science/api/pith-number/XZCHXYM3RBA4CBE7H2FTHELL22/graph.json","events_json":"https://pith.science/api/pith-number/XZCHXYM3RBA4CBE7H2FTHELL22/events.json","paper":"https://pith.science/paper/XZCHXYM3"},"agent_actions":{"view_html":"https://pith.science/pith/XZCHXYM3RBA4CBE7H2FTHELL22","download_json":"https://pith.science/pith/XZCHXYM3RBA4CBE7H2FTHELL22.json","view_paper":"https://pith.science/paper/XZCHXYM3","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2605.18758&json=true","fetch_graph":"https://pith.science/api/pith-number/XZCHXYM3RBA4CBE7H2FTHELL22/graph.json","fetch_events":"https://pith.science/api/pith-number/XZCHXYM3RBA4CBE7H2FTHELL22/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/XZCHXYM3RBA4CBE7H2FTHELL22/action/timestamp_anchor","attest_storage":"https://pith.science/pith/XZCHXYM3RBA4CBE7H2FTHELL22/action/storage_attestation","attest_author":"https://pith.science/pith/XZCHXYM3RBA4CBE7H2FTHELL22/action/author_attestation","sign_citation":"https://pith.science/pith/XZCHXYM3RBA4CBE7H2FTHELL22/action/citation_signature","submit_replication":"https://pith.science/pith/XZCHXYM3RBA4CBE7H2FTHELL22/action/replication_record"}},"created_at":"2026-05-20T00:06:20.368190+00:00","updated_at":"2026-05-20T00:06:20.368190+00:00"}