{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2025:QHECGYVLDT7OHOERAC35FBGPR7","short_pith_number":"pith:QHECGYVL","canonical_record":{"source":{"id":"2512.12634","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2025-12-14T10:41:39Z","cross_cats_sorted":[],"title_canon_sha256":"446390125615900b34aef5e039d642b5de22165a8a8970ee94f2aacaa577efac","abstract_canon_sha256":"1974e3286eef3c7f833714c06a065c85214ebf5b5ac70cc3196a453ce2f2dbe1"},"schema_version":"1.0"},"canonical_sha256":"81c82362ab1cfee3b89100b7d284cf8fdd2331bbd8ad4e3c50084163ad07cdbe","source":{"kind":"arxiv","id":"2512.12634","version":3},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2512.12634","created_at":"2026-05-18T03:09:32Z"},{"alias_kind":"arxiv_version","alias_value":"2512.12634v3","created_at":"2026-05-18T03:09:32Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2512.12634","created_at":"2026-05-18T03:09:32Z"},{"alias_kind":"pith_short_12","alias_value":"QHECGYVLDT7O","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"QHECGYVLDT7OHOER","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"QHECGYVL","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2025:QHECGYVLDT7OHOERAC35FBGPR7","target":"record","payload":{"canonical_record":{"source":{"id":"2512.12634","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2025-12-14T10:41:39Z","cross_cats_sorted":[],"title_canon_sha256":"446390125615900b34aef5e039d642b5de22165a8a8970ee94f2aacaa577efac","abstract_canon_sha256":"1974e3286eef3c7f833714c06a065c85214ebf5b5ac70cc3196a453ce2f2dbe1"},"schema_version":"1.0"},"canonical_sha256":"81c82362ab1cfee3b89100b7d284cf8fdd2331bbd8ad4e3c50084163ad07cdbe","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T03:09:32.624022Z","signature_b64":"6nWwmKbPfJ1sROQLUoGx6C0s6jUYYJY9BqpNhXPSZrPTwz8LKtdWUQj9Kqr4XA7x1idXc+TE1hN5ieVfZk8kCQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"81c82362ab1cfee3b89100b7d284cf8fdd2331bbd8ad4e3c50084163ad07cdbe","last_reissued_at":"2026-05-18T03:09:32.623517Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T03:09:32.623517Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2512.12634","source_version":3,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T03:09:32Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"Pe9S7eLqB10TDgyYi+b5lNPR7u6Jsd061hKad2sFLkLFJ6J/cD1kFi+3gJEjjlU41465XDhELCA9/D1WuwAFCQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-23T19:29:18.470187Z"},"content_sha256":"d8528eecc307aee0f1c0b732d6797af13021246866a880029a2da6e82f82a892","schema_version":"1.0","event_id":"sha256:d8528eecc307aee0f1c0b732d6797af13021246866a880029a2da6e82f82a892"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2025:QHECGYVLDT7OHOERAC35FBGPR7","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"MobiBench: Multi-Branch, Modular Benchmark for Mobile GUI Agents","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"MobiBench provides a modular offline benchmark for mobile GUI agents that matches human evaluators at 94.72 percent agreement.","cross_cats":[],"primary_cat":"cs.AI","authors_text":"Byeongung Jo, Insik Shin, Jaeyoung Wi, Joo Hyung Lee, Sangeun Oh, Seungwoo Baek, Sunjae Lee, Tae Hoon Min, Youngmin Im","submitted_at":"2025-12-14T10:41:39Z","abstract_excerpt":"Mobile GUI Agents, AI agents capable of interacting with mobile applications on behalf of users, have the potential to transform human computer interaction. However, current evaluation practices for GUI agents face two fundamental limitations. First, they either rely on single path offline benchmarks or online live benchmarks. Offline benchmarks using static, single path annotated datasets unfairly penalize valid alternative actions, while online benchmarks suffer from poor scalability and reproducibility due to the dynamic and unpredictable nature of live evaluation. Second, existing benchmar"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"MobiBench achieves 94.72 percent agreement with human evaluators, on par with carefully engineered online benchmarks, while preserving the scalability and reproducibility of static offline benchmarks.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the multi-path annotations comprehensively capture all valid alternative actions that human evaluators would accept, without systematic omissions that could affect agreement rates.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"MobiBench is the first modular multi-path offline benchmark for mobile GUI agents, achieving 94.72% agreement with human evaluators while allowing component-level analysis.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"MobiBench provides a modular offline benchmark for mobile GUI agents that matches human evaluators at 94.72 percent agreement.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"97188cebcd3704b64cd3f0ab059e7c3a8f0183e66160efd4e44cd163bbcb37ff"},"source":{"id":"2512.12634","kind":"arxiv","version":3},"verdict":{"id":"bf3c66b4-5c2f-409c-8054-7e3527f8805c","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-16T22:56:01.658808Z","strongest_claim":"MobiBench achieves 94.72 percent agreement with human evaluators, on par with carefully engineered online benchmarks, while preserving the scalability and reproducibility of static offline benchmarks.","one_line_summary":"MobiBench is the first modular multi-path offline benchmark for mobile GUI agents, achieving 94.72% agreement with human evaluators while allowing component-level analysis.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the multi-path annotations comprehensively capture all valid alternative actions that human evaluators would accept, without systematic omissions that could affect agreement rates.","pith_extraction_headline":"MobiBench provides a modular offline benchmark for mobile GUI agents that matches human evaluators at 94.72 percent agreement."},"references":{"count":63,"sample":[{"doi":"","year":2025,"title":"Agent S2: A compositional generalist-specialist framework for computer use agents","work_id":"56466266-2822-44dd-a6e9-6036285f881e","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2020,"title":"Language Models are Few-Shot Learners","work_id":"214732c0-2edd-44a0-af9e-28184a2b8279","ref_index":2,"cited_arxiv_id":"2005.14165","is_internal_anchor":true},{"doi":"","year":2021,"title":"Andrea Burns, Deniz Arsan, Sanjna Agrawal, Ranjitha Kumar, Kate Saenko, and Bryan A Plummer. 2021. Mobile app tasks with itera- tive feedback (motif): Addressing task feasibility in interactive visual","work_id":"7eab30d2-619d-4727-baf1-031a621eebaf","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"arXiv preprint arXiv:2407.17490 , year=","work_id":"07ffe67e-3e5a-406c-84fe-3de84f8dd21d","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"The BrowserGym ecosystem for web agent research.arXiv preprint arXiv:2412.05467","work_id":"f7dd22e4-8dc0-4a62-a313-cb0712e5d7dc","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":63,"snapshot_sha256":"2b283ac1b32241950ac6c6d83290ef2ddacf547ea27c5a0dd80c9b8a45db6f41","internal_anchors":11},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"bf3c66b4-5c2f-409c-8054-7e3527f8805c"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T03:09:32Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"7CnDTxMCioTZyfrqzF/BAVZY0YstRjirtfn9tz3Y9Y+L/tPQ1/TqyPwE+9+inR+/lTEsirX1zkzpmYY3m006Ag==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-23T19:29:18.471229Z"},"content_sha256":"a31c176046cc011d608492174d77d7530edb926c051188ac24f0bbfbb6e30eef","schema_version":"1.0","event_id":"sha256:a31c176046cc011d608492174d77d7530edb926c051188ac24f0bbfbb6e30eef"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/QHECGYVLDT7OHOERAC35FBGPR7/bundle.json","state_url":"https://pith.science/pith/QHECGYVLDT7OHOERAC35FBGPR7/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/QHECGYVLDT7OHOERAC35FBGPR7/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-23T19:29:18Z","links":{"resolver":"https://pith.science/pith/QHECGYVLDT7OHOERAC35FBGPR7","bundle":"https://pith.science/pith/QHECGYVLDT7OHOERAC35FBGPR7/bundle.json","state":"https://pith.science/pith/QHECGYVLDT7OHOERAC35FBGPR7/state.json","well_known_bundle":"https://pith.science/.well-known/pith/QHECGYVLDT7OHOERAC35FBGPR7/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2025:QHECGYVLDT7OHOERAC35FBGPR7","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"1974e3286eef3c7f833714c06a065c85214ebf5b5ac70cc3196a453ce2f2dbe1","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2025-12-14T10:41:39Z","title_canon_sha256":"446390125615900b34aef5e039d642b5de22165a8a8970ee94f2aacaa577efac"},"schema_version":"1.0","source":{"id":"2512.12634","kind":"arxiv","version":3}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2512.12634","created_at":"2026-05-18T03:09:32Z"},{"alias_kind":"arxiv_version","alias_value":"2512.12634v3","created_at":"2026-05-18T03:09:32Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2512.12634","created_at":"2026-05-18T03:09:32Z"},{"alias_kind":"pith_short_12","alias_value":"QHECGYVLDT7O","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"QHECGYVLDT7OHOER","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"QHECGYVL","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:a31c176046cc011d608492174d77d7530edb926c051188ac24f0bbfbb6e30eef","target":"graph","created_at":"2026-05-18T03:09:32Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"MobiBench achieves 94.72 percent agreement with human evaluators, on par with carefully engineered online benchmarks, while preserving the scalability and reproducibility of static offline benchmarks."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the multi-path annotations comprehensively capture all valid alternative actions that human evaluators would accept, without systematic omissions that could affect agreement rates."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"MobiBench is the first modular multi-path offline benchmark for mobile GUI agents, achieving 94.72% agreement with human evaluators while allowing component-level analysis."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"MobiBench provides a modular offline benchmark for mobile GUI agents that matches human evaluators at 94.72 percent agreement."}],"snapshot_sha256":"97188cebcd3704b64cd3f0ab059e7c3a8f0183e66160efd4e44cd163bbcb37ff"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"Mobile GUI Agents, AI agents capable of interacting with mobile applications on behalf of users, have the potential to transform human computer interaction. However, current evaluation practices for GUI agents face two fundamental limitations. First, they either rely on single path offline benchmarks or online live benchmarks. Offline benchmarks using static, single path annotated datasets unfairly penalize valid alternative actions, while online benchmarks suffer from poor scalability and reproducibility due to the dynamic and unpredictable nature of live evaluation. Second, existing benchmar","authors_text":"Byeongung Jo, Insik Shin, Jaeyoung Wi, Joo Hyung Lee, Sangeun Oh, Seungwoo Baek, Sunjae Lee, Tae Hoon Min, Youngmin Im","cross_cats":[],"headline":"MobiBench provides a modular offline benchmark for mobile GUI agents that matches human evaluators at 94.72 percent agreement.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2025-12-14T10:41:39Z","title":"MobiBench: Multi-Branch, Modular Benchmark for Mobile GUI Agents"},"references":{"count":63,"internal_anchors":11,"resolved_work":63,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Agent S2: A compositional generalist-specialist framework for computer use agents","work_id":"56466266-2822-44dd-a6e9-6036285f881e","year":2025},{"cited_arxiv_id":"2005.14165","doi":"","is_internal_anchor":true,"ref_index":2,"title":"Language Models are Few-Shot Learners","work_id":"214732c0-2edd-44a0-af9e-28184a2b8279","year":2020},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Andrea Burns, Deniz Arsan, Sanjna Agrawal, Ranjitha Kumar, Kate Saenko, and Bryan A Plummer. 2021. Mobile app tasks with itera- tive feedback (motif): Addressing task feasibility in interactive visual","work_id":"7eab30d2-619d-4727-baf1-031a621eebaf","year":2021},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"arXiv preprint arXiv:2407.17490 , year=","work_id":"07ffe67e-3e5a-406c-84fe-3de84f8dd21d","year":2024},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"The BrowserGym ecosystem for web agent research.arXiv preprint arXiv:2412.05467","work_id":"f7dd22e4-8dc0-4a62-a313-cb0712e5d7dc","year":2024}],"snapshot_sha256":"2b283ac1b32241950ac6c6d83290ef2ddacf547ea27c5a0dd80c9b8a45db6f41"},"source":{"id":"2512.12634","kind":"arxiv","version":3},"verdict":{"created_at":"2026-05-16T22:56:01.658808Z","id":"bf3c66b4-5c2f-409c-8054-7e3527f8805c","model_set":{"reader":"grok-4.3"},"one_line_summary":"MobiBench is the first modular multi-path offline benchmark for mobile GUI agents, achieving 94.72% agreement with human evaluators while allowing component-level analysis.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"MobiBench provides a modular offline benchmark for mobile GUI agents that matches human evaluators at 94.72 percent agreement.","strongest_claim":"MobiBench achieves 94.72 percent agreement with human evaluators, on par with carefully engineered online benchmarks, while preserving the scalability and reproducibility of static offline benchmarks.","weakest_assumption":"That the multi-path annotations comprehensively capture all valid alternative actions that human evaluators would accept, without systematic omissions that could affect agreement rates."}},"verdict_id":"bf3c66b4-5c2f-409c-8054-7e3527f8805c"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:d8528eecc307aee0f1c0b732d6797af13021246866a880029a2da6e82f82a892","target":"record","created_at":"2026-05-18T03:09:32Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"1974e3286eef3c7f833714c06a065c85214ebf5b5ac70cc3196a453ce2f2dbe1","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2025-12-14T10:41:39Z","title_canon_sha256":"446390125615900b34aef5e039d642b5de22165a8a8970ee94f2aacaa577efac"},"schema_version":"1.0","source":{"id":"2512.12634","kind":"arxiv","version":3}},"canonical_sha256":"81c82362ab1cfee3b89100b7d284cf8fdd2331bbd8ad4e3c50084163ad07cdbe","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"81c82362ab1cfee3b89100b7d284cf8fdd2331bbd8ad4e3c50084163ad07cdbe","first_computed_at":"2026-05-18T03:09:32.623517Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-18T03:09:32.623517Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"6nWwmKbPfJ1sROQLUoGx6C0s6jUYYJY9BqpNhXPSZrPTwz8LKtdWUQj9Kqr4XA7x1idXc+TE1hN5ieVfZk8kCQ==","signature_status":"signed_v1","signed_at":"2026-05-18T03:09:32.624022Z","signed_message":"canonical_sha256_bytes"},"source_id":"2512.12634","source_kind":"arxiv","source_version":3}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:d8528eecc307aee0f1c0b732d6797af13021246866a880029a2da6e82f82a892","sha256:a31c176046cc011d608492174d77d7530edb926c051188ac24f0bbfbb6e30eef"],"state_sha256":"9c2a783e3cfc6b73325320a9da1cb2762432eaf143f366f20e8309b0388bdfc0"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"AFmcuLH5xnTk6EPXJcFCsen4E89FvBA1D4FkgzDOVmC8SeZbJYJJTAclFNQI1dZb9ADRTv47874Vaqhqcv7ABg==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-23T19:29:18.475947Z","bundle_sha256":"c3ba7fa17ffd09f2e52cf2bcc8416049edeb8fe450ef0e66b8b7ff0641babe87"}}