{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:WRJXYT43L5CHOHEVXRH55FQBDG","short_pith_number":"pith:WRJXYT43","schema_version":"1.0","canonical_sha256":"b4537c4f9b5f44771c95bc4fde9601198855fb927d0086718ea87b4890ea082b","source":{"kind":"arxiv","id":"2605.13139","version":1},"attestation_state":"computed","paper":{"title":"SWE-Cycle: Benchmarking Code Agents across the Complete Issue Resolution Cycle","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Code agents show sharply lower success rates when handling complete issue resolution autonomously versus in isolated subtasks.","cross_cats":[],"primary_cat":"cs.SE","authors_text":"Hao Guan, Kangning Zhang, Lingyue Fu, Lin Qiu, Shao Zhang, Weinan Zhang, Weiwen Liu, Xuezhi Cao, Xunliang Cai, Yaoming Zhu, Yong Yu","submitted_at":"2026-05-13T08:05:16Z","abstract_excerpt":"As autonomous code agents move toward end-to-end software development, evaluating their practical autonomy becomes critical. Current benchmarks hide friction by testing agents in pre-configured environments, and their static evaluation pipelines frequently fail when parsing fully autonomous trajectories. We address these limitations with SWE-Cycle, a benchmark of 489 rigorously filtered instances. SWE-Cycle evaluates agents across three isolated tasks, including environment reconstruction, code implementation, and verification test generation, as well as an end-to-end FullCycle task that integ"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":true,"formal_links_present":false},"canonical_record":{"source":{"id":"2605.13139","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.SE","submitted_at":"2026-05-13T08:05:16Z","cross_cats_sorted":[],"title_canon_sha256":"b6ab07ec35f275f76d2d1c83327c48eedca8f2c143c8260e94e88a166c7db4bb","abstract_canon_sha256":"a632e2b0355406cda9c6b3bf42793c472e422d164019278a193ecf76af82506e"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T03:08:57.495365Z","signature_b64":"hoVw2CzIp35ii/mOk91/HmuAu6D8e7InXdLP+VvHrjq5vMMaTs6i4GLjVB17peqKMQtYrxou9aDU+rYy87EqAA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"b4537c4f9b5f44771c95bc4fde9601198855fb927d0086718ea87b4890ea082b","last_reissued_at":"2026-05-18T03:08:57.494442Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T03:08:57.494442Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"SWE-Cycle: Benchmarking Code Agents across the Complete Issue Resolution Cycle","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Code agents show sharply lower success rates when handling complete issue resolution autonomously versus in isolated subtasks.","cross_cats":[],"primary_cat":"cs.SE","authors_text":"Hao Guan, Kangning Zhang, Lingyue Fu, Lin Qiu, Shao Zhang, Weinan Zhang, Weiwen Liu, Xuezhi Cao, Xunliang Cai, Yaoming Zhu, Yong Yu","submitted_at":"2026-05-13T08:05:16Z","abstract_excerpt":"As autonomous code agents move toward end-to-end software development, evaluating their practical autonomy becomes critical. Current benchmarks hide friction by testing agents in pre-configured environments, and their static evaluation pipelines frequently fail when parsing fully autonomous trajectories. We address these limitations with SWE-Cycle, a benchmark of 489 rigorously filtered instances. SWE-Cycle evaluates agents across three isolated tasks, including environment reconstruction, code implementation, and verification test generation, as well as an end-to-end FullCycle task that integ"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"The results reveal a sharp drop in solve rates when transitioning from isolated tasks to FullCycle execution, exposing critical bottlenecks in handling cross-phase dependencies and maintaining code quality.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The 489 rigorously filtered instances and the SWE-Judge evaluation accurately capture practical autonomy without introducing selection bias or verification errors that would change the observed performance drop.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"SWE-Cycle benchmark shows sharp drops in code agent success rates from isolated tasks to full autonomous issue resolution, highlighting cross-phase dependency issues.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Code agents show sharply lower success rates when handling complete issue resolution autonomously versus in isolated subtasks.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"c5e3a9f10502354c2d86f7c689b5952b727394f972ea95112b069d02d33983bb"},"source":{"id":"2605.13139","kind":"arxiv","version":1},"verdict":{"id":"a718588a-517f-4647-8103-fb1943071fd3","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-14T18:32:49.817448Z","strongest_claim":"The results reveal a sharp drop in solve rates when transitioning from isolated tasks to FullCycle execution, exposing critical bottlenecks in handling cross-phase dependencies and maintaining code quality.","one_line_summary":"SWE-Cycle benchmark shows sharp drops in code agent success rates from isolated tasks to full autonomous issue resolution, highlighting cross-phase dependency issues.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The 489 rigorously filtered instances and the SWE-Judge evaluation accurately capture practical autonomy without introducing selection bias or verification errors that would change the observed performance drop.","pith_extraction_headline":"Code agents show sharply lower success rates when handling complete issue resolution autonomously versus in isolated subtasks."},"references":{"count":66,"sample":[{"doi":"","year":null,"title":"Claude 4.6 sonnet system card","work_id":"85bdec73-f606-453f-ac9a-34ba951a6b05","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"URL https://assets.anthropic.com/m/785e231869ea8b3b/original/ Claude-4-6-Sonnet-System-Card.pdf","work_id":"9dc8b2b7-21b7-4e27-ac6c-d6ac7d3ec38f","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2025,"title":"Anthropic. Claude code, 2025. URLhttps://github.com/anthropics/claude-code","work_id":"a974e0ed-b8d8-4534-a3aa-34731e913fbf","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2025,"title":"Introducing claude opus 4.5","work_id":"135f65e0-92b8-43bb-8f8e-a7cd633b7005","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2025,"title":"Why Do Multi-Agent LLM Systems Fail?","work_id":"b186294a-cda7-4df0-9a28-27d379af92b2","ref_index":5,"cited_arxiv_id":"2503.13657","is_internal_anchor":true}],"resolved_work":66,"snapshot_sha256":"8b4734778106998e880b2fe3f427d0e9dea9e4ede5dd060260ff25b3bc86b55c","internal_anchors":12},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2605.13139","created_at":"2026-05-18T03:08:57.494594+00:00"},{"alias_kind":"arxiv_version","alias_value":"2605.13139v1","created_at":"2026-05-18T03:08:57.494594+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.13139","created_at":"2026-05-18T03:08:57.494594+00:00"},{"alias_kind":"pith_short_12","alias_value":"WRJXYT43L5CH","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_16","alias_value":"WRJXYT43L5CHOHEV","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_8","alias_value":"WRJXYT43","created_at":"2026-05-18T12:33:37.589309+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/WRJXYT43L5CHOHEVXRH55FQBDG","json":"https://pith.science/pith/WRJXYT43L5CHOHEVXRH55FQBDG.json","graph_json":"https://pith.science/api/pith-number/WRJXYT43L5CHOHEVXRH55FQBDG/graph.json","events_json":"https://pith.science/api/pith-number/WRJXYT43L5CHOHEVXRH55FQBDG/events.json","paper":"https://pith.science/paper/WRJXYT43"},"agent_actions":{"view_html":"https://pith.science/pith/WRJXYT43L5CHOHEVXRH55FQBDG","download_json":"https://pith.science/pith/WRJXYT43L5CHOHEVXRH55FQBDG.json","view_paper":"https://pith.science/paper/WRJXYT43","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2605.13139&json=true","fetch_graph":"https://pith.science/api/pith-number/WRJXYT43L5CHOHEVXRH55FQBDG/graph.json","fetch_events":"https://pith.science/api/pith-number/WRJXYT43L5CHOHEVXRH55FQBDG/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/WRJXYT43L5CHOHEVXRH55FQBDG/action/timestamp_anchor","attest_storage":"https://pith.science/pith/WRJXYT43L5CHOHEVXRH55FQBDG/action/storage_attestation","attest_author":"https://pith.science/pith/WRJXYT43L5CHOHEVXRH55FQBDG/action/author_attestation","sign_citation":"https://pith.science/pith/WRJXYT43L5CHOHEVXRH55FQBDG/action/citation_signature","submit_replication":"https://pith.science/pith/WRJXYT43L5CHOHEVXRH55FQBDG/action/replication_record"}},"created_at":"2026-05-18T03:08:57.494594+00:00","updated_at":"2026-05-18T03:08:57.494594+00:00"}