{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:Q4HAQCFSPQYDKAEL64U6WKFBGG","short_pith_number":"pith:Q4HAQCFS","schema_version":"1.0","canonical_sha256":"870e0808b27c3035008bf729eb28a13185f55d7d07c3d4388e8f9fe566d0f3be","source":{"kind":"arxiv","id":"2604.05157","version":2},"attestation_state":"computed","paper":{"title":"IntentScore: Intent-Conditioned Action Evaluation for Computer-Use Agents","license":"http://creativecommons.org/licenses/by/4.0/","headline":"A reward model that embeds planning intent scores candidate actions for GUI agents, achieving 97.5 percent pairwise accuracy and lifting success rates by 6.9 points on unseen tasks.","cross_cats":[],"primary_cat":"cs.AI","authors_text":"Rongqian Chen, Sizhe Tang, Tian Lan, Weidong Cao, Yu Li, Zeyu Fang","submitted_at":"2026-04-06T20:39:30Z","abstract_excerpt":"Computer-Use Agents (CUAs) leverage large language models to execute GUI operations on desktop environments, yet they generate actions without evaluating action quality, leading to irreversible errors that cascade through subsequent steps. We propose IntentScore, a plan-aware reward model that learns to score candidate actions from 398K offline GUI interaction steps spanning three operating systems. IntentScore trains with two complementary objectives: contrastive alignment for state-action relevance and margin ranking for action correctness. Architecturally, it embeds each candidate's plannin"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2604.05157","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2026-04-06T20:39:30Z","cross_cats_sorted":[],"title_canon_sha256":"e9bb77a77a4875add3f89238b977fcffd78b3ed4feea3058ca7676af8539984f","abstract_canon_sha256":"1fd3992fcca9df7bd982c234f7abb1e51ff4569acf7c0693ab87640dff200178"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-25T02:02:15.107395Z","signature_b64":"mBkYNXD1tN0XXlBeQx25KasWxR7MXtZXbB2Au1e7n4IIprn+8s6BUqYVMxw0sUYTkgI6Vcpf3Uqix1I6sWYPAQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"870e0808b27c3035008bf729eb28a13185f55d7d07c3d4388e8f9fe566d0f3be","last_reissued_at":"2026-05-25T02:02:15.106552Z","signature_status":"signed_v1","first_computed_at":"2026-05-25T02:02:15.106552Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"IntentScore: Intent-Conditioned Action Evaluation for Computer-Use Agents","license":"http://creativecommons.org/licenses/by/4.0/","headline":"A reward model that embeds planning intent scores candidate actions for GUI agents, achieving 97.5 percent pairwise accuracy and lifting success rates by 6.9 points on unseen tasks.","cross_cats":[],"primary_cat":"cs.AI","authors_text":"Rongqian Chen, Sizhe Tang, Tian Lan, Weidong Cao, Yu Li, Zeyu Fang","submitted_at":"2026-04-06T20:39:30Z","abstract_excerpt":"Computer-Use Agents (CUAs) leverage large language models to execute GUI operations on desktop environments, yet they generate actions without evaluating action quality, leading to irreversible errors that cascade through subsequent steps. We propose IntentScore, a plan-aware reward model that learns to score candidate actions from 398K offline GUI interaction steps spanning three operating systems. IntentScore trains with two complementary objectives: contrastive alignment for state-action relevance and margin ranking for action correctness. Architecturally, it embeds each candidate's plannin"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"IntentScore achieves 97.5% pairwise discrimination accuracy on held-out evaluation. Deployed as a re-ranker for Agent S3 on OSWorld, an environment entirely unseen during training, IntentScore improves task success rate by 6.9 points.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the 398K offline trajectories from three operating systems contain sufficient coverage of the action distributions and intent patterns that will appear when the model is deployed as a re-ranker for new agents on new task distributions.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"IntentScore learns intent-conditioned action scores from offline GUI trajectories and raises task success by 6.9 points on an unseen agent and environment.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"A reward model that embeds planning intent scores candidate actions for GUI agents, achieving 97.5 percent pairwise accuracy and lifting success rates by 6.9 points on unseen tasks.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"ea9ac05b4ce35c9543a22d635d7442dc4f68e77f584d43a94087453f04291020"},"source":{"id":"2604.05157","kind":"arxiv","version":2},"verdict":{"id":"d5108aa4-01e5-4039-86a8-2719727f8843","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-10T19:02:27.974934Z","strongest_claim":"IntentScore achieves 97.5% pairwise discrimination accuracy on held-out evaluation. Deployed as a re-ranker for Agent S3 on OSWorld, an environment entirely unseen during training, IntentScore improves task success rate by 6.9 points.","one_line_summary":"IntentScore learns intent-conditioned action scores from offline GUI trajectories and raises task success by 6.9 points on an unseen agent and environment.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the 398K offline trajectories from three operating systems contain sufficient coverage of the action distributions and intent patterns that will appear when the model is deployed as a re-ranker for new agents on new task distributions.","pith_extraction_headline":"A reward model that embeds planning intent scores candidate actions for GUI agents, achieving 97.5 percent pairwise accuracy and lifting success rates by 6.9 points on unseen tasks."},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2604.05157/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2604.05157","created_at":"2026-05-25T02:02:15.106660+00:00"},{"alias_kind":"arxiv_version","alias_value":"2604.05157v2","created_at":"2026-05-25T02:02:15.106660+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2604.05157","created_at":"2026-05-25T02:02:15.106660+00:00"},{"alias_kind":"pith_short_12","alias_value":"Q4HAQCFSPQYD","created_at":"2026-05-25T02:02:15.106660+00:00"},{"alias_kind":"pith_short_16","alias_value":"Q4HAQCFSPQYDKAEL","created_at":"2026-05-25T02:02:15.106660+00:00"},{"alias_kind":"pith_short_8","alias_value":"Q4HAQCFS","created_at":"2026-05-25T02:02:15.106660+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/Q4HAQCFSPQYDKAEL64U6WKFBGG","json":"https://pith.science/pith/Q4HAQCFSPQYDKAEL64U6WKFBGG.json","graph_json":"https://pith.science/api/pith-number/Q4HAQCFSPQYDKAEL64U6WKFBGG/graph.json","events_json":"https://pith.science/api/pith-number/Q4HAQCFSPQYDKAEL64U6WKFBGG/events.json","paper":"https://pith.science/paper/Q4HAQCFS"},"agent_actions":{"view_html":"https://pith.science/pith/Q4HAQCFSPQYDKAEL64U6WKFBGG","download_json":"https://pith.science/pith/Q4HAQCFSPQYDKAEL64U6WKFBGG.json","view_paper":"https://pith.science/paper/Q4HAQCFS","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2604.05157&json=true","fetch_graph":"https://pith.science/api/pith-number/Q4HAQCFSPQYDKAEL64U6WKFBGG/graph.json","fetch_events":"https://pith.science/api/pith-number/Q4HAQCFSPQYDKAEL64U6WKFBGG/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/Q4HAQCFSPQYDKAEL64U6WKFBGG/action/timestamp_anchor","attest_storage":"https://pith.science/pith/Q4HAQCFSPQYDKAEL64U6WKFBGG/action/storage_attestation","attest_author":"https://pith.science/pith/Q4HAQCFSPQYDKAEL64U6WKFBGG/action/author_attestation","sign_citation":"https://pith.science/pith/Q4HAQCFSPQYDKAEL64U6WKFBGG/action/citation_signature","submit_replication":"https://pith.science/pith/Q4HAQCFSPQYDKAEL64U6WKFBGG/action/replication_record"}},"created_at":"2026-05-25T02:02:15.106660+00:00","updated_at":"2026-05-25T02:02:15.106660+00:00"}