{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:WRJXYT43L5CHOHEVXRH55FQBDG","short_pith_number":"pith:WRJXYT43","canonical_record":{"source":{"id":"2605.13139","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.SE","submitted_at":"2026-05-13T08:05:16Z","cross_cats_sorted":[],"title_canon_sha256":"b6ab07ec35f275f76d2d1c83327c48eedca8f2c143c8260e94e88a166c7db4bb","abstract_canon_sha256":"a632e2b0355406cda9c6b3bf42793c472e422d164019278a193ecf76af82506e"},"schema_version":"1.0"},"canonical_sha256":"b4537c4f9b5f44771c95bc4fde9601198855fb927d0086718ea87b4890ea082b","source":{"kind":"arxiv","id":"2605.13139","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.13139","created_at":"2026-05-18T03:08:57Z"},{"alias_kind":"arxiv_version","alias_value":"2605.13139v1","created_at":"2026-05-18T03:08:57Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.13139","created_at":"2026-05-18T03:08:57Z"},{"alias_kind":"pith_short_12","alias_value":"WRJXYT43L5CH","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"WRJXYT43L5CHOHEV","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"WRJXYT43","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:WRJXYT43L5CHOHEVXRH55FQBDG","target":"record","payload":{"canonical_record":{"source":{"id":"2605.13139","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.SE","submitted_at":"2026-05-13T08:05:16Z","cross_cats_sorted":[],"title_canon_sha256":"b6ab07ec35f275f76d2d1c83327c48eedca8f2c143c8260e94e88a166c7db4bb","abstract_canon_sha256":"a632e2b0355406cda9c6b3bf42793c472e422d164019278a193ecf76af82506e"},"schema_version":"1.0"},"canonical_sha256":"b4537c4f9b5f44771c95bc4fde9601198855fb927d0086718ea87b4890ea082b","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T03:08:57.495365Z","signature_b64":"hoVw2CzIp35ii/mOk91/HmuAu6D8e7InXdLP+VvHrjq5vMMaTs6i4GLjVB17peqKMQtYrxou9aDU+rYy87EqAA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"b4537c4f9b5f44771c95bc4fde9601198855fb927d0086718ea87b4890ea082b","last_reissued_at":"2026-05-18T03:08:57.494442Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T03:08:57.494442Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2605.13139","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T03:08:57Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"HHeVUa3PyBQoLIGL9kJ2ZA7zNE6Q3EMfoCt2dl3fp1ofQbXCx8Ru3IJpxIcatwb7XHb2G0a4XHGkoptIGEp6Cw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-26T19:14:59.578113Z"},"content_sha256":"a3b71136db9bee8b5a5363da72ba4f1230c045a72bbb217ac1dc3a8a7dc439b6","schema_version":"1.0","event_id":"sha256:a3b71136db9bee8b5a5363da72ba4f1230c045a72bbb217ac1dc3a8a7dc439b6"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:WRJXYT43L5CHOHEVXRH55FQBDG","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"SWE-Cycle: Benchmarking Code Agents across the Complete Issue Resolution Cycle","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Code agents show sharply lower success rates when handling complete issue resolution autonomously versus in isolated subtasks.","cross_cats":[],"primary_cat":"cs.SE","authors_text":"Hao Guan, Kangning Zhang, Lingyue Fu, Lin Qiu, Shao Zhang, Weinan Zhang, Weiwen Liu, Xuezhi Cao, Xunliang Cai, Yaoming Zhu, Yong Yu","submitted_at":"2026-05-13T08:05:16Z","abstract_excerpt":"As autonomous code agents move toward end-to-end software development, evaluating their practical autonomy becomes critical. Current benchmarks hide friction by testing agents in pre-configured environments, and their static evaluation pipelines frequently fail when parsing fully autonomous trajectories. We address these limitations with SWE-Cycle, a benchmark of 489 rigorously filtered instances. SWE-Cycle evaluates agents across three isolated tasks, including environment reconstruction, code implementation, and verification test generation, as well as an end-to-end FullCycle task that integ"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"The results reveal a sharp drop in solve rates when transitioning from isolated tasks to FullCycle execution, exposing critical bottlenecks in handling cross-phase dependencies and maintaining code quality.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The 489 rigorously filtered instances and the SWE-Judge evaluation accurately capture practical autonomy without introducing selection bias or verification errors that would change the observed performance drop.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"SWE-Cycle benchmark shows sharp drops in code agent success rates from isolated tasks to full autonomous issue resolution, highlighting cross-phase dependency issues.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Code agents show sharply lower success rates when handling complete issue resolution autonomously versus in isolated subtasks.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"c5e3a9f10502354c2d86f7c689b5952b727394f972ea95112b069d02d33983bb"},"source":{"id":"2605.13139","kind":"arxiv","version":1},"verdict":{"id":"a718588a-517f-4647-8103-fb1943071fd3","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-14T18:32:49.817448Z","strongest_claim":"The results reveal a sharp drop in solve rates when transitioning from isolated tasks to FullCycle execution, exposing critical bottlenecks in handling cross-phase dependencies and maintaining code quality.","one_line_summary":"SWE-Cycle benchmark shows sharp drops in code agent success rates from isolated tasks to full autonomous issue resolution, highlighting cross-phase dependency issues.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The 489 rigorously filtered instances and the SWE-Judge evaluation accurately capture practical autonomy without introducing selection bias or verification errors that would change the observed performance drop.","pith_extraction_headline":"Code agents show sharply lower success rates when handling complete issue resolution autonomously versus in isolated subtasks."},"references":{"count":66,"sample":[{"doi":"","year":null,"title":"Claude 4.6 sonnet system card","work_id":"85bdec73-f606-453f-ac9a-34ba951a6b05","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"URL https://assets.anthropic.com/m/785e231869ea8b3b/original/ Claude-4-6-Sonnet-System-Card.pdf","work_id":"9dc8b2b7-21b7-4e27-ac6c-d6ac7d3ec38f","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2025,"title":"Anthropic. Claude code, 2025. URLhttps://github.com/anthropics/claude-code","work_id":"a974e0ed-b8d8-4534-a3aa-34731e913fbf","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2025,"title":"Introducing claude opus 4.5","work_id":"135f65e0-92b8-43bb-8f8e-a7cd633b7005","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2025,"title":"Why Do Multi-Agent LLM Systems Fail?","work_id":"b186294a-cda7-4df0-9a28-27d379af92b2","ref_index":5,"cited_arxiv_id":"2503.13657","is_internal_anchor":true}],"resolved_work":66,"snapshot_sha256":"8b4734778106998e880b2fe3f427d0e9dea9e4ede5dd060260ff25b3bc86b55c","internal_anchors":12},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"a718588a-517f-4647-8103-fb1943071fd3"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T03:08:57Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"VaTSKKbDjZVJ3pj0QBb2NvXaj9Lozo3iG8cEEYfEw7EWIqZ59EYsNz0PJeauHHHK/Wo3XRjQWA39Kohnq2rVCQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-26T19:14:59.579107Z"},"content_sha256":"a30ee48c96bfd8da298205bd130ea8febc2b173fd401b3e2481baf2f219a9150","schema_version":"1.0","event_id":"sha256:a30ee48c96bfd8da298205bd130ea8febc2b173fd401b3e2481baf2f219a9150"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/WRJXYT43L5CHOHEVXRH55FQBDG/bundle.json","state_url":"https://pith.science/pith/WRJXYT43L5CHOHEVXRH55FQBDG/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/WRJXYT43L5CHOHEVXRH55FQBDG/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-26T19:14:59Z","links":{"resolver":"https://pith.science/pith/WRJXYT43L5CHOHEVXRH55FQBDG","bundle":"https://pith.science/pith/WRJXYT43L5CHOHEVXRH55FQBDG/bundle.json","state":"https://pith.science/pith/WRJXYT43L5CHOHEVXRH55FQBDG/state.json","well_known_bundle":"https://pith.science/.well-known/pith/WRJXYT43L5CHOHEVXRH55FQBDG/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:WRJXYT43L5CHOHEVXRH55FQBDG","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"a632e2b0355406cda9c6b3bf42793c472e422d164019278a193ecf76af82506e","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.SE","submitted_at":"2026-05-13T08:05:16Z","title_canon_sha256":"b6ab07ec35f275f76d2d1c83327c48eedca8f2c143c8260e94e88a166c7db4bb"},"schema_version":"1.0","source":{"id":"2605.13139","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.13139","created_at":"2026-05-18T03:08:57Z"},{"alias_kind":"arxiv_version","alias_value":"2605.13139v1","created_at":"2026-05-18T03:08:57Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.13139","created_at":"2026-05-18T03:08:57Z"},{"alias_kind":"pith_short_12","alias_value":"WRJXYT43L5CH","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"WRJXYT43L5CHOHEV","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"WRJXYT43","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:a30ee48c96bfd8da298205bd130ea8febc2b173fd401b3e2481baf2f219a9150","target":"graph","created_at":"2026-05-18T03:08:57Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"The results reveal a sharp drop in solve rates when transitioning from isolated tasks to FullCycle execution, exposing critical bottlenecks in handling cross-phase dependencies and maintaining code quality."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"The 489 rigorously filtered instances and the SWE-Judge evaluation accurately capture practical autonomy without introducing selection bias or verification errors that would change the observed performance drop."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"SWE-Cycle benchmark shows sharp drops in code agent success rates from isolated tasks to full autonomous issue resolution, highlighting cross-phase dependency issues."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Code agents show sharply lower success rates when handling complete issue resolution autonomously versus in isolated subtasks."}],"snapshot_sha256":"c5e3a9f10502354c2d86f7c689b5952b727394f972ea95112b069d02d33983bb"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"As autonomous code agents move toward end-to-end software development, evaluating their practical autonomy becomes critical. Current benchmarks hide friction by testing agents in pre-configured environments, and their static evaluation pipelines frequently fail when parsing fully autonomous trajectories. We address these limitations with SWE-Cycle, a benchmark of 489 rigorously filtered instances. SWE-Cycle evaluates agents across three isolated tasks, including environment reconstruction, code implementation, and verification test generation, as well as an end-to-end FullCycle task that integ","authors_text":"Hao Guan, Kangning Zhang, Lingyue Fu, Lin Qiu, Shao Zhang, Weinan Zhang, Weiwen Liu, Xuezhi Cao, Xunliang Cai, Yaoming Zhu, Yong Yu","cross_cats":[],"headline":"Code agents show sharply lower success rates when handling complete issue resolution autonomously versus in isolated subtasks.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.SE","submitted_at":"2026-05-13T08:05:16Z","title":"SWE-Cycle: Benchmarking Code Agents across the Complete Issue Resolution Cycle"},"references":{"count":66,"internal_anchors":12,"resolved_work":66,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Claude 4.6 sonnet system card","work_id":"85bdec73-f606-453f-ac9a-34ba951a6b05","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"URL https://assets.anthropic.com/m/785e231869ea8b3b/original/ Claude-4-6-Sonnet-System-Card.pdf","work_id":"9dc8b2b7-21b7-4e27-ac6c-d6ac7d3ec38f","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Anthropic. Claude code, 2025. URLhttps://github.com/anthropics/claude-code","work_id":"a974e0ed-b8d8-4534-a3aa-34731e913fbf","year":2025},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Introducing claude opus 4.5","work_id":"135f65e0-92b8-43bb-8f8e-a7cd633b7005","year":2025},{"cited_arxiv_id":"2503.13657","doi":"","is_internal_anchor":true,"ref_index":5,"title":"Why Do Multi-Agent LLM Systems Fail?","work_id":"b186294a-cda7-4df0-9a28-27d379af92b2","year":2025}],"snapshot_sha256":"8b4734778106998e880b2fe3f427d0e9dea9e4ede5dd060260ff25b3bc86b55c"},"source":{"id":"2605.13139","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-14T18:32:49.817448Z","id":"a718588a-517f-4647-8103-fb1943071fd3","model_set":{"reader":"grok-4.3"},"one_line_summary":"SWE-Cycle benchmark shows sharp drops in code agent success rates from isolated tasks to full autonomous issue resolution, highlighting cross-phase dependency issues.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Code agents show sharply lower success rates when handling complete issue resolution autonomously versus in isolated subtasks.","strongest_claim":"The results reveal a sharp drop in solve rates when transitioning from isolated tasks to FullCycle execution, exposing critical bottlenecks in handling cross-phase dependencies and maintaining code quality.","weakest_assumption":"The 489 rigorously filtered instances and the SWE-Judge evaluation accurately capture practical autonomy without introducing selection bias or verification errors that would change the observed performance drop."}},"verdict_id":"a718588a-517f-4647-8103-fb1943071fd3"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:a3b71136db9bee8b5a5363da72ba4f1230c045a72bbb217ac1dc3a8a7dc439b6","target":"record","created_at":"2026-05-18T03:08:57Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"a632e2b0355406cda9c6b3bf42793c472e422d164019278a193ecf76af82506e","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.SE","submitted_at":"2026-05-13T08:05:16Z","title_canon_sha256":"b6ab07ec35f275f76d2d1c83327c48eedca8f2c143c8260e94e88a166c7db4bb"},"schema_version":"1.0","source":{"id":"2605.13139","kind":"arxiv","version":1}},"canonical_sha256":"b4537c4f9b5f44771c95bc4fde9601198855fb927d0086718ea87b4890ea082b","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"b4537c4f9b5f44771c95bc4fde9601198855fb927d0086718ea87b4890ea082b","first_computed_at":"2026-05-18T03:08:57.494442Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-18T03:08:57.494442Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"hoVw2CzIp35ii/mOk91/HmuAu6D8e7InXdLP+VvHrjq5vMMaTs6i4GLjVB17peqKMQtYrxou9aDU+rYy87EqAA==","signature_status":"signed_v1","signed_at":"2026-05-18T03:08:57.495365Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.13139","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:a3b71136db9bee8b5a5363da72ba4f1230c045a72bbb217ac1dc3a8a7dc439b6","sha256:a30ee48c96bfd8da298205bd130ea8febc2b173fd401b3e2481baf2f219a9150"],"state_sha256":"041f46540d2b081cd6b6137a1d5b394cc2de22899a773f136c69d6642efd3f06"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"VBVFprEfedoGz+PtiRaWqjaTR9CCe5RLuzS6+md6Rli9XvQ4gXsb7hGHVYvYWoZYRnrkEwTTbARoaOp/md38CQ==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-26T19:14:59.582475Z","bundle_sha256":"a6d8fa36f5ab50928abc86a081515449495a5fc97b677a57b24c70170c558cb7"}}