{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2025:LPM2GGKOMAOVP3AIND2NNJ45OD","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"808b6ee12521cfa65d940dbff573db470cef0b046828badf3561d86e29929d47","cross_cats_sorted":["cs.AI","cs.CL"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.SE","submitted_at":"2025-04-03T14:06:17Z","title_canon_sha256":"786fcc79ffc89a2a8b47161e0f7428763a97880976273b83de4674713cf59455"},"schema_version":"1.0","source":{"id":"2504.02605","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2504.02605","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"arxiv_version","alias_value":"2504.02605v1","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2504.02605","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"pith_short_12","alias_value":"LPM2GGKOMAOV","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"LPM2GGKOMAOVP3AI","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"LPM2GGKO","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:5d8c34c404d6e4d7ad079fb9824b30a63429a95b82d0efd85acffabb45de1eb5","target":"graph","created_at":"2026-05-17T23:38:48Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"we introduce a multilingual issue-resolving benchmark, called Multi-SWE-bench, covering Java, TypeScript, JavaScript, Go, Rust, C, and C++. It includes a total of 1,632 high-quality instances, which were carefully annotated from 2,456 candidates by 68 expert annotators, ensuring that the benchmark can provide an accurate and reliable evaluation."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"The 68 expert annotators' curation from 2,456 candidates to 1,632 instances produces an unbiased, high-quality, and representative set that accurately reflects real-world issue-resolving difficulty across languages."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Multi-SWE-bench provides 1,632 high-quality issue-resolving instances across Java, TypeScript, JavaScript, Go, Rust, C, and C++ for evaluating LLMs on codebase modifications."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Multi-SWE-bench supplies 1632 expert-curated issue-resolving tasks across seven languages to test LLMs beyond Python-only benchmarks."}],"snapshot_sha256":"697bee0fa03bded8910dcaed581e4bf94113ae96302e6e955140d5431d109140"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"The task of issue resolving is to modify a codebase to generate a patch that addresses a given issue. However, existing benchmarks, such as SWE-bench, focus almost exclusively on Python, making them insufficient for evaluating Large Language Models (LLMs) across diverse software ecosystems. To address this, we introduce a multilingual issue-resolving benchmark, called Multi-SWE-bench, covering Java, TypeScript, JavaScript, Go, Rust, C, and C++. It includes a total of 1,632 high-quality instances, which were carefully annotated from 2,456 candidates by 68 expert annotators, ensuring that the be","authors_text":"Aoyan Li, Daoguang Zan, Hanwu Chen, Jing Su, Kai Shen, Liangqiang Chen, Liang Xiang, Linhao Zhang, Lu Chen, Qi Liu, Rui Long, Shulin Xin, Siyao Liu, Tianyu Liu, Wei Liu, Xiaojian Zhong, Yongsheng Xiao, Yuyu Zhang, Zhirong Huang","cross_cats":["cs.AI","cs.CL"],"headline":"Multi-SWE-bench supplies 1632 expert-curated issue-resolving tasks across seven languages to test LLMs beyond Python-only benchmarks.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.SE","submitted_at":"2025-04-03T14:06:17Z","title":"Multi-SWE-bench: A Multilingual Benchmark for Issue Resolving"},"references":{"count":23,"internal_anchors":7,"resolved_work":23,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"R. Abreu, P . Zoeteweij, and A. J. Van Gemund. On the accuracy of spectrum-based fault localization. In Testing: Academic and industrial conference practice and research techniques- MUTATION (TAICP AR","work_id":"7898c2e3-7589-4501-9f75-48e2a670f389","year":2007},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"M. Allamanis and C. Sutton. Mining source code repositories at massive scale using language modeling. In 2013 10th working conference on mining software repositories (MSR), pages 207–216. IEEE,","work_id":"58a4b45d-f7e2-4aaf-99c4-5a9ed742cec7","year":2013},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Multi-lingual evaluation of code generation models","work_id":"1283984e-956f-4584-8b1a-ae121c31626c","year":null},{"cited_arxiv_id":"2108.07732","doi":"","is_internal_anchor":true,"ref_index":4,"title":"Program Synthesis with Large Language Models","work_id":"fd241a05-03b9-4de2-9588-9d77ce176125","year":2025},{"cited_arxiv_id":"2107.03374","doi":"","is_internal_anchor":true,"ref_index":5,"title":"Evaluating Large Language Models Trained on Code","work_id":"042493e9-b26f-4b4e-bbde-382072ca9b08","year":null}],"snapshot_sha256":"a193678b530303929f0674249e7897ebf393c36c0349496b37a4bb5af76243a7"},"source":{"id":"2504.02605","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-16T06:45:01.500998Z","id":"703ddaeb-a5e4-4ef7-8a3d-01251ec238e1","model_set":{"reader":"grok-4.3"},"one_line_summary":"Multi-SWE-bench provides 1,632 high-quality issue-resolving instances across Java, TypeScript, JavaScript, Go, Rust, C, and C++ for evaluating LLMs on codebase modifications.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Multi-SWE-bench supplies 1632 expert-curated issue-resolving tasks across seven languages to test LLMs beyond Python-only benchmarks.","strongest_claim":"we introduce a multilingual issue-resolving benchmark, called Multi-SWE-bench, covering Java, TypeScript, JavaScript, Go, Rust, C, and C++. It includes a total of 1,632 high-quality instances, which were carefully annotated from 2,456 candidates by 68 expert annotators, ensuring that the benchmark can provide an accurate and reliable evaluation.","weakest_assumption":"The 68 expert annotators' curation from 2,456 candidates to 1,632 instances produces an unbiased, high-quality, and representative set that accurately reflects real-world issue-resolving difficulty across languages."}},"verdict_id":"703ddaeb-a5e4-4ef7-8a3d-01251ec238e1"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:ec620040c637663457bc511b31b521ceac4d8771243c201924e2c24a90beaf23","target":"record","created_at":"2026-05-17T23:38:48Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"808b6ee12521cfa65d940dbff573db470cef0b046828badf3561d86e29929d47","cross_cats_sorted":["cs.AI","cs.CL"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.SE","submitted_at":"2025-04-03T14:06:17Z","title_canon_sha256":"786fcc79ffc89a2a8b47161e0f7428763a97880976273b83de4674713cf59455"},"schema_version":"1.0","source":{"id":"2504.02605","kind":"arxiv","version":1}},"canonical_sha256":"5bd9a3194e601d57ec0868f4d6a79d70dc64d6e781232b82ed790948529fe591","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"5bd9a3194e601d57ec0868f4d6a79d70dc64d6e781232b82ed790948529fe591","first_computed_at":"2026-05-17T23:38:48.785976Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:48.785976Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"/NRAVBxxuWJfjgadU+gf6nIi1G5l2srmFJ/YZFibfojIQ3DtTh1HxcvpRHNgss/n+tmpRUdrEtwLt0qJcjH0Cw==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:48.786621Z","signed_message":"canonical_sha256_bytes"},"source_id":"2504.02605","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:ec620040c637663457bc511b31b521ceac4d8771243c201924e2c24a90beaf23","sha256:5d8c34c404d6e4d7ad079fb9824b30a63429a95b82d0efd85acffabb45de1eb5"],"state_sha256":"53cdf2f4f0cf252bb0b50b992a00e3ad8c8746a819d13158e3cf866ae7c3152b"}