{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:RWN7GKM5UGUN4YJWNKD4K5VZJF","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"92e4a4b010941537fa510bc1d347dccda9f84b9477ca68019c2669b6979fecd9","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2026-05-16T08:51:57Z","title_canon_sha256":"fde63058e5b2c82fcb5a9970e45e6402dbd520321982627d335c5e54f6f532a0"},"schema_version":"1.0","source":{"id":"2605.16883","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.16883","created_at":"2026-05-20T00:03:28Z"},{"alias_kind":"arxiv_version","alias_value":"2605.16883v1","created_at":"2026-05-20T00:03:28Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.16883","created_at":"2026-05-20T00:03:28Z"},{"alias_kind":"pith_short_12","alias_value":"RWN7GKM5UGUN","created_at":"2026-05-20T00:03:28Z"},{"alias_kind":"pith_short_16","alias_value":"RWN7GKM5UGUN4YJW","created_at":"2026-05-20T00:03:28Z"},{"alias_kind":"pith_short_8","alias_value":"RWN7GKM5","created_at":"2026-05-20T00:03:28Z"}],"graph_snapshots":[{"event_id":"sha256:a012bf74f886d16cf30bbeb611b8c4410d8ae98e225e9b0bc30e6043746ed853","target":"graph","created_at":"2026-05-20T00:03:28Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"SE-GA achieves state-of-the-art performance, reaching success rates of 89.0% on ScreenSpot and 75.8% on the challenging AndroidControl-High dataset with significant improvements on AndroidWorld."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"The data collected by TTME during inference is of sufficient quality and diversity to stabilize and enhance the foundational policy through the MASE training pipeline without introducing harmful biases or catastrophic forgetting."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"SE-GA combines Test-Time Memory Extension for dynamic context retrieval with Memory-Augmented Self-Evolution training to reach 89.0% on ScreenSpot and 75.8% on AndroidControl-High."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"The SE-GA framework lets GUI agents self-evolve by retrieving memories at test time and retraining on the resulting data to reach higher success rates on multi-step tasks."}],"snapshot_sha256":"1bbda7a182e17ecfb51bb25a70877a330db510301e37b5c4fdbe5c032e31ca38"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"a13899b2b8dfde6e953f8550e9fa98cba1fa7e1b5b1e2da97f36e51dd1e2f237"},"integrity":{"available":true,"clean":true,"detectors_run":[{"findings_count":0,"name":"doi_title_agreement","ran_at":"2026-05-19T21:01:19.201708Z","status":"completed","version":"1.0.0"},{"findings_count":0,"name":"doi_compliance","ran_at":"2026-05-19T20:40:54.801548Z","status":"completed","version":"1.0.0"},{"findings_count":0,"name":"claim_evidence","ran_at":"2026-05-19T18:41:56.289167Z","status":"completed","version":"1.0.0"},{"findings_count":0,"name":"ai_meta_artifact","ran_at":"2026-05-19T18:33:26.366699Z","status":"skipped","version":"1.0.0"}],"endpoint":"/pith/2605.16883/integrity.json","findings":[],"snapshot_sha256":"0d87c8cd8a2217a0f948ba6f287089e138ab5d857a66137e5bc840eab86299bb","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Autonomous Graphical User Interface (GUI) agents often struggle with multi-step tasks due to constrained context windows and static policies that fail to adapt to dynamic environments. To address these limitations, this work proposes the Self-Evolving GUI Agent (SE-GA), a novel framework that integrates hierarchical memory structures with an iterative self-improvement mechanism. At the core of our approach is Test-Time Memory Extension (TTME), which facilitates long-term planning by dynamically retrieving episodic, semantic, and experiential memories to provide salient contexts during inferenc","authors_text":"Lanjun Wang, Shilong Jin, Zhuosheng Zhang","cross_cats":[],"headline":"The SE-GA framework lets GUI agents self-evolve by retrieving memories at test time and retraining on the resulting data to reach higher success rates on multi-step tasks.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2026-05-16T08:51:57Z","title":"SE-GA: Memory-Augmented Self-Evolution for GUI Agents"},"references":{"count":54,"internal_anchors":23,"resolved_work":54,"sample":[{"cited_arxiv_id":"1707.01495","doi":"","is_internal_anchor":true,"ref_index":1,"title":"Charles Beattie, Thomas Köppe, Edgar A","work_id":"a7ba78be-3bef-4f48-b954-7a5dfbc0d5b6","year":2018},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Our 3.5 models and computer use, 2024","work_id":"eee73463-da91-4974-aa09-8c42d946f365","year":2024},{"cited_arxiv_id":"2502.13923","doi":"","is_internal_anchor":true,"ref_index":3,"title":"Qwen2.5-VL Technical Report","work_id":"69dffacb-bfe8-442d-be86-48624c60426f","year":2025},{"cited_arxiv_id":"","doi":"10.18653/v1/2025.findings-acl.110","is_internal_anchor":false,"ref_index":4,"title":"Amex: Android multi-annotation expo dataset for mobile gui agents","work_id":"f943a109-e446-4ece-ac87-cb331992c240","year":2025},{"cited_arxiv_id":"2412.05271","doi":"","is_internal_anchor":true,"ref_index":5,"title":"Expanding Performance Boundaries of Open-Source Multimodal Models with Model, Data, and Test-Time Scaling","work_id":"ee70bdc8-4656-4849-ada7-ce42a2278d70","year":2024}],"snapshot_sha256":"4f6dff3a4f0f0641ffc63905629654b01ecdd40b037ba5ecd2815d04955d3152"},"source":{"id":"2605.16883","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-19T20:34:46.917590Z","id":"dbc3ffac-ec5c-483f-b00b-40406036b1cd","model_set":{"reader":"grok-4.3"},"one_line_summary":"SE-GA combines Test-Time Memory Extension for dynamic context retrieval with Memory-Augmented Self-Evolution training to reach 89.0% on ScreenSpot and 75.8% on AndroidControl-High.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"The SE-GA framework lets GUI agents self-evolve by retrieving memories at test time and retraining on the resulting data to reach higher success rates on multi-step tasks.","strongest_claim":"SE-GA achieves state-of-the-art performance, reaching success rates of 89.0% on ScreenSpot and 75.8% on the challenging AndroidControl-High dataset with significant improvements on AndroidWorld.","weakest_assumption":"The data collected by TTME during inference is of sufficient quality and diversity to stabilize and enhance the foundational policy through the MASE training pipeline without introducing harmful biases or catastrophic forgetting."}},"verdict_id":"dbc3ffac-ec5c-483f-b00b-40406036b1cd"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:b15d8dcd268284bd3ce6c3e716b73b5e22d350d3fecfca3909041a11ac593108","target":"record","created_at":"2026-05-20T00:03:28Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"92e4a4b010941537fa510bc1d347dccda9f84b9477ca68019c2669b6979fecd9","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2026-05-16T08:51:57Z","title_canon_sha256":"fde63058e5b2c82fcb5a9970e45e6402dbd520321982627d335c5e54f6f532a0"},"schema_version":"1.0","source":{"id":"2605.16883","kind":"arxiv","version":1}},"canonical_sha256":"8d9bf3299da1a8de61366a87c576b9495d9a8a544022a02789be91016fe977cd","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"8d9bf3299da1a8de61366a87c576b9495d9a8a544022a02789be91016fe977cd","first_computed_at":"2026-05-20T00:03:28.132282Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-20T00:03:28.132282Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"0GH8P7B4xorLmzx776o39D7y7Ugg7Mqzqgqf4doDqeuQ//yxodGLY8XFJw3HmOlTHDdwst3AclaqRMiciAIODw==","signature_status":"signed_v1","signed_at":"2026-05-20T00:03:28.133038Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.16883","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:b15d8dcd268284bd3ce6c3e716b73b5e22d350d3fecfca3909041a11ac593108","sha256:a012bf74f886d16cf30bbeb611b8c4410d8ae98e225e9b0bc30e6043746ed853"],"state_sha256":"af2def82fd6dc45d74223e08487a93ae86fd496977101ba4229a3e1b76da496f"}