{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:H6UDO7IPENMTQKXAXTNMJ33II6","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"163597358c8bb8172230ee1c963c10c414ddaed2eb7ac29ae3f07ca21a601e21","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.SE","submitted_at":"2026-05-17T16:15:56Z","title_canon_sha256":"8915d7c6cbf7d59a1304c8146e19184686d39e5b04b1747292b3026376a3e130"},"schema_version":"1.0","source":{"id":"2605.17526","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.17526","created_at":"2026-05-20T00:04:44Z"},{"alias_kind":"arxiv_version","alias_value":"2605.17526v1","created_at":"2026-05-20T00:04:44Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.17526","created_at":"2026-05-20T00:04:44Z"},{"alias_kind":"pith_short_12","alias_value":"H6UDO7IPENMT","created_at":"2026-05-20T00:04:44Z"},{"alias_kind":"pith_short_16","alias_value":"H6UDO7IPENMTQKXA","created_at":"2026-05-20T00:04:44Z"},{"alias_kind":"pith_short_8","alias_value":"H6UDO7IP","created_at":"2026-05-20T00:04:44Z"}],"graph_snapshots":[{"event_id":"sha256:32498f16ae249c27368e9d5201a0b96f015d2e7410ea475baabbafcad1fba57a","target":"graph","created_at":"2026-05-20T00:04:44Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Over 95% of task failures occur before agents even reach deep business logic, with models often falling victim to overconfidence and prematurely halting during foundational system setup, or getting trapped in ineffective debugging loops."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"The 30 tasks and 5,370 validation nodes sufficiently capture the heterogeneity, coupling, and long-horizon constraints of real enterprise SaaS systems without introducing artificial simplifications that favor or penalize particular agent behaviors."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"SaaSBench introduces a heterogeneous benchmark for enterprise SaaS engineering and shows that state-of-the-art coding agents fail over 95% of the time before reaching deep business logic due to setup and integration problems."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Coding agents fail over 95% of enterprise SaaS tasks before reaching business logic."}],"snapshot_sha256":"51b01b058f2d465606844c407afe609e13900ca6a21cdd65315e6289261a6235"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"faff2925ee4128d9cdae5059c729e4e38811b032bc4a56e9e84034a5e48e78cb"},"integrity":{"available":true,"clean":true,"detectors_run":[{"findings_count":0,"name":"doi_title_agreement","ran_at":"2026-05-19T23:01:19.506064Z","status":"completed","version":"1.0.0"},{"findings_count":0,"name":"doi_compliance","ran_at":"2026-05-19T22:51:22.131783Z","status":"completed","version":"1.0.0"},{"findings_count":0,"name":"ai_meta_artifact","ran_at":"2026-05-19T21:33:23.621790Z","status":"skipped","version":"1.0.0"},{"findings_count":0,"name":"claim_evidence","ran_at":"2026-05-19T21:21:57.561524Z","status":"completed","version":"1.0.0"}],"endpoint":"/pith/2605.17526/integrity.json","findings":[],"snapshot_sha256":"89b3f057ed720ead8dd2864089a002bd9629cc9e86f7c7f4b306a85321294d27","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"As autonomous coding agents become capable of handling increasingly long-horizon tasks, they have gradually demonstrated the potential to complete end-to-end software development. Although existing benchmarks have recently evolved from localized code editing to from-scratch project generation, they remain confined to structurally simplified, single-stack applications. Consequently, they fail to capture the heterogeneous environments, full-stack orchestration, and system-level complexity of real enterprise Software as a Service (SaaS) systems, leaving a critical gap in assessing agents under re","authors_text":"Feng Zhao, Kou Shi, Lin Chen, Qingnan Ren, Qisheng Su, Shiting Huang, Shun Zou, Xiangxiang Chu, Yiming Zhao, Yong Wang, Yu Zeng, Zehui Chen, Zhen Fang, Ziao Zhang","cross_cats":["cs.AI"],"headline":"Coding agents fail over 95% of enterprise SaaS tasks before reaching business logic.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.SE","submitted_at":"2026-05-17T16:15:56Z","title":"SaaSBench: Exploring the Boundaries of Coding Agents in Long-Horizon Enterprise SaaS Engineering"},"references":{"count":56,"internal_anchors":8,"resolved_work":56,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Claude code: Ai-powered coding assistant, 2024","work_id":"64af9f1e-212a-4edf-a08b-0c18035f0fe3","year":2024},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"System card: Claude opus 4 & claude sonnet 4, 2025","work_id":"c0fcd08e-8b63-4d56-aeea-810cc7b36af4","year":2025},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"System card: Claude sonnet 4.5, 2025","work_id":"90fef2d7-a7a1-441b-ab30-3274e4d8c007","year":2025},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Introducing Claude Opus 4.7, 2026","work_id":"b4d51065-eb29-4e19-bed5-ca92cc03e244","year":2026},{"cited_arxiv_id":"2108.07732","doi":"","is_internal_anchor":true,"ref_index":5,"title":"Program Synthesis with Large Language Models","work_id":"fd241a05-03b9-4de2-9588-9d77ce176125","year":2021}],"snapshot_sha256":"37f950e94af4a8729b958df5904e9dfdb5e30163ebe412794cbc3541ffbebae8"},"source":{"id":"2605.17526","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-19T22:39:51.142699Z","id":"d2e76412-658b-44e8-83e1-2678addc70b4","model_set":{"reader":"grok-4.3"},"one_line_summary":"SaaSBench introduces a heterogeneous benchmark for enterprise SaaS engineering and shows that state-of-the-art coding agents fail over 95% of the time before reaching deep business logic due to setup and integration problems.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Coding agents fail over 95% of enterprise SaaS tasks before reaching business logic.","strongest_claim":"Over 95% of task failures occur before agents even reach deep business logic, with models often falling victim to overconfidence and prematurely halting during foundational system setup, or getting trapped in ineffective debugging loops.","weakest_assumption":"The 30 tasks and 5,370 validation nodes sufficiently capture the heterogeneity, coupling, and long-horizon constraints of real enterprise SaaS systems without introducing artificial simplifications that favor or penalize particular agent behaviors."}},"verdict_id":"d2e76412-658b-44e8-83e1-2678addc70b4"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:6bf066ad5a319519cc65ababa1a78fa035f302154ff73ff88df5f71f8862c2ca","target":"record","created_at":"2026-05-20T00:04:44Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"163597358c8bb8172230ee1c963c10c414ddaed2eb7ac29ae3f07ca21a601e21","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.SE","submitted_at":"2026-05-17T16:15:56Z","title_canon_sha256":"8915d7c6cbf7d59a1304c8146e19184686d39e5b04b1747292b3026376a3e130"},"schema_version":"1.0","source":{"id":"2605.17526","kind":"arxiv","version":1}},"canonical_sha256":"3fa8377d0f2359382ae0bcdac4ef684789c1edcd81da74c9e5535912cce13f28","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"3fa8377d0f2359382ae0bcdac4ef684789c1edcd81da74c9e5535912cce13f28","first_computed_at":"2026-05-20T00:04:44.046775Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-20T00:04:44.046775Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"aAwtcNfMrZrhquQieGXejsbMoeOx2rO29c0acmsVWx5fL6HdCgcJJX7TNtw5STK2sxPf51yxfX1EruvfV/w5Ag==","signature_status":"signed_v1","signed_at":"2026-05-20T00:04:44.047635Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.17526","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:6bf066ad5a319519cc65ababa1a78fa035f302154ff73ff88df5f71f8862c2ca","sha256:32498f16ae249c27368e9d5201a0b96f015d2e7410ea475baabbafcad1fba57a"],"state_sha256":"fc3b87489735cfbfbdd46b12d6087ce1969e2938322da61605bd7ad33ab137e7"}