{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:G7K23BUA4CTJTTX5N3UHNJC4LZ","short_pith_number":"pith:G7K23BUA","schema_version":"1.0","canonical_sha256":"37d5ad8680e0a699cefd6ee876a45c5e7c5668559ce447c4eb2f1beee9f98cf6","source":{"kind":"arxiv","id":"2605.18583","version":1},"attestation_state":"computed","paper":{"title":"Overeager Coding Agents: Measuring Out-of-Scope Actions on Benign Tasks","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI","cs.CL","cs.CR"],"primary_cat":"cs.SE","authors_text":"Gelei Deng, Leo Yu Zhang, Yanjun Zhang, Yi Liu, Ying Zhang, Yubin Qu, Yuekang Li","submitted_at":"2026-05-18T16:00:41Z","abstract_excerpt":"Coding agents now run autonomously with shell, file, and network privileges. When a user issues a benign request, the agent sometimes does more than asked: it deletes unrelated files, wipes a stale credentials backup, or rewrites configuration the user never mentioned. We call these scope expansions overeager actions, an authorization problem distinct from capability failures, prompt injection, or sandbox escapes.\n  We present OverEager-Gen, a benchmark dedicated to overeager behavior on benign tasks. Building it surfaces a measurement-validity issue: if a benchmark spells out the authorized s"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2605.18583","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.SE","submitted_at":"2026-05-18T16:00:41Z","cross_cats_sorted":["cs.AI","cs.CL","cs.CR"],"title_canon_sha256":"d16c6844d02a61e8b4b048b8f98aa1c6b8c53a3a47f352b829888185d3065a9e","abstract_canon_sha256":"14537eb2ee94313e079e21ae4df82586ecd7b2f7d8e09c7d6bba109adc3f736d"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T00:06:09.079196Z","signature_b64":"6p/m+mjIyIuih4bW5L4+jnAk3Aw0eFrUg7V5G4D9JgNY6eSkiQGJ0dfW85quv6o2rbnFGXYz7w5VJT2gxaGHBg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"37d5ad8680e0a699cefd6ee876a45c5e7c5668559ce447c4eb2f1beee9f98cf6","last_reissued_at":"2026-05-20T00:06:09.078451Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T00:06:09.078451Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Overeager Coding Agents: Measuring Out-of-Scope Actions on Benign Tasks","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI","cs.CL","cs.CR"],"primary_cat":"cs.SE","authors_text":"Gelei Deng, Leo Yu Zhang, Yanjun Zhang, Yi Liu, Ying Zhang, Yubin Qu, Yuekang Li","submitted_at":"2026-05-18T16:00:41Z","abstract_excerpt":"Coding agents now run autonomously with shell, file, and network privileges. When a user issues a benign request, the agent sometimes does more than asked: it deletes unrelated files, wipes a stale credentials backup, or rewrites configuration the user never mentioned. We call these scope expansions overeager actions, an authorization problem distinct from capability failures, prompt injection, or sandbox escapes.\n  We present OverEager-Gen, a benchmark dedicated to overeager behavior on benign tasks. Building it surfaces a measurement-validity issue: if a benchmark spells out the authorized s"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.18583","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.18583/integrity.json","findings":[],"available":true,"detectors_run":[{"name":"claim_evidence","ran_at":"2026-05-20T00:01:59.304162Z","status":"completed","version":"1.0.0","findings_count":0}],"snapshot_sha256":"f830a06d7edec6ba00cc3ee776edfadce8c2d8ed225138d727df4623927a3758"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2605.18583","created_at":"2026-05-20T00:06:09.078563+00:00"},{"alias_kind":"arxiv_version","alias_value":"2605.18583v1","created_at":"2026-05-20T00:06:09.078563+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.18583","created_at":"2026-05-20T00:06:09.078563+00:00"},{"alias_kind":"pith_short_12","alias_value":"G7K23BUA4CTJ","created_at":"2026-05-20T00:06:09.078563+00:00"},{"alias_kind":"pith_short_16","alias_value":"G7K23BUA4CTJTTX5","created_at":"2026-05-20T00:06:09.078563+00:00"},{"alias_kind":"pith_short_8","alias_value":"G7K23BUA","created_at":"2026-05-20T00:06:09.078563+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/G7K23BUA4CTJTTX5N3UHNJC4LZ","json":"https://pith.science/pith/G7K23BUA4CTJTTX5N3UHNJC4LZ.json","graph_json":"https://pith.science/api/pith-number/G7K23BUA4CTJTTX5N3UHNJC4LZ/graph.json","events_json":"https://pith.science/api/pith-number/G7K23BUA4CTJTTX5N3UHNJC4LZ/events.json","paper":"https://pith.science/paper/G7K23BUA"},"agent_actions":{"view_html":"https://pith.science/pith/G7K23BUA4CTJTTX5N3UHNJC4LZ","download_json":"https://pith.science/pith/G7K23BUA4CTJTTX5N3UHNJC4LZ.json","view_paper":"https://pith.science/paper/G7K23BUA","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2605.18583&json=true","fetch_graph":"https://pith.science/api/pith-number/G7K23BUA4CTJTTX5N3UHNJC4LZ/graph.json","fetch_events":"https://pith.science/api/pith-number/G7K23BUA4CTJTTX5N3UHNJC4LZ/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/G7K23BUA4CTJTTX5N3UHNJC4LZ/action/timestamp_anchor","attest_storage":"https://pith.science/pith/G7K23BUA4CTJTTX5N3UHNJC4LZ/action/storage_attestation","attest_author":"https://pith.science/pith/G7K23BUA4CTJTTX5N3UHNJC4LZ/action/author_attestation","sign_citation":"https://pith.science/pith/G7K23BUA4CTJTTX5N3UHNJC4LZ/action/citation_signature","submit_replication":"https://pith.science/pith/G7K23BUA4CTJTTX5N3UHNJC4LZ/action/replication_record"}},"created_at":"2026-05-20T00:06:09.078563+00:00","updated_at":"2026-05-20T00:06:09.078563+00:00"}