{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:JDREUXQCV2PGUCRNUPK265FVIE","short_pith_number":"pith:JDREUXQC","schema_version":"1.0","canonical_sha256":"48e24a5e02ae9e6a0a2da3d5af74b54110d7f1277a7f1d57557ae2e3d6d0c232","source":{"kind":"arxiv","id":"2511.06090","version":3},"attestation_state":"computed","paper":{"title":"SWE-fficiency: Can Language Models Optimize Real-World Repositories on Real Workloads?","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI","cs.PF"],"primary_cat":"cs.SE","authors_text":"Amir Yazdanbakhsh, Enhui Li, Jeffrey Jian Ma, Kevin Swersky, Milad Hashemi, Ofir Press, Parthasarathy Ranganathan, Vijay Janapa Reddi","submitted_at":"2025-11-08T17:55:09Z","abstract_excerpt":"Optimizing the performance of large-scale software repositories demands expertise in code reasoning and software engineering (SWE) to reduce runtime while preserving program correctness. However, most benchmarks emphasize what to fix rather than how to fix code. We introduce SWE-fficiency, a benchmark for evaluating repository-level performance optimization on real workloads. Our suite contains 498 tasks across nine widely used data-science, machine-learning, and HPC repositories (e.g., numpy, pandas, scipy): given a complete codebase and a slow workload, an agent must investigate code semanti"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2511.06090","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.SE","submitted_at":"2025-11-08T17:55:09Z","cross_cats_sorted":["cs.AI","cs.PF"],"title_canon_sha256":"01f9f4869890d9afa83e1a542b5c1158ef47c6ab8240b4ce80e20ce49efe2d3e","abstract_canon_sha256":"c4e99ec4bde3aec376c374f6e58d6deb0332d56b985b7f5b0c4be595221b2c31"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-30T01:17:28.779689Z","signature_b64":"p7n355JMXqTvkmKf9HJNu0t4TvA1BYNzm2xTKn1ggIvN+APgh777y5zsqY2LOxvUeUnlXglpjzta4mS2dl/tAQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"48e24a5e02ae9e6a0a2da3d5af74b54110d7f1277a7f1d57557ae2e3d6d0c232","last_reissued_at":"2026-06-30T01:17:28.776680Z","signature_status":"signed_v1","first_computed_at":"2026-06-30T01:17:28.776680Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"SWE-fficiency: Can Language Models Optimize Real-World Repositories on Real Workloads?","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI","cs.PF"],"primary_cat":"cs.SE","authors_text":"Amir Yazdanbakhsh, Enhui Li, Jeffrey Jian Ma, Kevin Swersky, Milad Hashemi, Ofir Press, Parthasarathy Ranganathan, Vijay Janapa Reddi","submitted_at":"2025-11-08T17:55:09Z","abstract_excerpt":"Optimizing the performance of large-scale software repositories demands expertise in code reasoning and software engineering (SWE) to reduce runtime while preserving program correctness. However, most benchmarks emphasize what to fix rather than how to fix code. We introduce SWE-fficiency, a benchmark for evaluating repository-level performance optimization on real workloads. Our suite contains 498 tasks across nine widely used data-science, machine-learning, and HPC repositories (e.g., numpy, pandas, scipy): given a complete codebase and a slow workload, an agent must investigate code semanti"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2511.06090","kind":"arxiv","version":3},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2511.06090/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2511.06090","created_at":"2026-06-30T01:17:28.776825+00:00"},{"alias_kind":"arxiv_version","alias_value":"2511.06090v3","created_at":"2026-06-30T01:17:28.776825+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2511.06090","created_at":"2026-06-30T01:17:28.776825+00:00"},{"alias_kind":"pith_short_12","alias_value":"JDREUXQCV2PG","created_at":"2026-06-30T01:17:28.776825+00:00"},{"alias_kind":"pith_short_16","alias_value":"JDREUXQCV2PGUCRN","created_at":"2026-06-30T01:17:28.776825+00:00"},{"alias_kind":"pith_short_8","alias_value":"JDREUXQC","created_at":"2026-06-30T01:17:28.776825+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":4,"internal_anchor_count":4,"sample":[{"citing_arxiv_id":"2606.31767","citing_title":"JETO-Bench: A Reproducible Benchmark for Execution Time Improvement Patches in Java","ref_index":29,"is_internal_anchor":true},{"citing_arxiv_id":"2605.28751","citing_title":"Extrapolative Weight Averaging Reveals Correctness-Efficiency Frontiers in Code RL","ref_index":28,"is_internal_anchor":true},{"citing_arxiv_id":"2605.10890","citing_title":"CppPerf: An Automated Pipeline and Dataset for Performance-Improving C++ Commits","ref_index":13,"is_internal_anchor":true},{"citing_arxiv_id":"2605.06068","citing_title":"VibeServe: Can AI Agents Build Bespoke LLM Serving Systems?","ref_index":48,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/JDREUXQCV2PGUCRNUPK265FVIE","json":"https://pith.science/pith/JDREUXQCV2PGUCRNUPK265FVIE.json","graph_json":"https://pith.science/api/pith-number/JDREUXQCV2PGUCRNUPK265FVIE/graph.json","events_json":"https://pith.science/api/pith-number/JDREUXQCV2PGUCRNUPK265FVIE/events.json","paper":"https://pith.science/paper/JDREUXQC"},"agent_actions":{"view_html":"https://pith.science/pith/JDREUXQCV2PGUCRNUPK265FVIE","download_json":"https://pith.science/pith/JDREUXQCV2PGUCRNUPK265FVIE.json","view_paper":"https://pith.science/paper/JDREUXQC","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2511.06090&json=true","fetch_graph":"https://pith.science/api/pith-number/JDREUXQCV2PGUCRNUPK265FVIE/graph.json","fetch_events":"https://pith.science/api/pith-number/JDREUXQCV2PGUCRNUPK265FVIE/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/JDREUXQCV2PGUCRNUPK265FVIE/action/timestamp_anchor","attest_storage":"https://pith.science/pith/JDREUXQCV2PGUCRNUPK265FVIE/action/storage_attestation","attest_author":"https://pith.science/pith/JDREUXQCV2PGUCRNUPK265FVIE/action/author_attestation","sign_citation":"https://pith.science/pith/JDREUXQCV2PGUCRNUPK265FVIE/action/citation_signature","submit_replication":"https://pith.science/pith/JDREUXQCV2PGUCRNUPK265FVIE/action/replication_record"}},"created_at":"2026-06-30T01:17:28.776825+00:00","updated_at":"2026-06-30T01:17:28.776825+00:00"}