{"paper":{"title":"EnergyAgentBench: Benchmarking LLM Agents on Live Energy Infrastructure Data","license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","headline":"EnergyAgentBench tests LLM agents on live electricity market data to select optimal sites for AI datacenters.","cross_cats":[],"primary_cat":"econ.EM","authors_text":"Eliseo Curcio","submitted_at":"2026-05-13T18:03:51Z","abstract_excerpt":"Selecting the right electricity market region for a hyperscale AI datacenter requires reasoning across live electricity prices, grid carbon intensity, technology cost trajectories, and causal grid dynamics -- a multi-step, multi-source analytical task that static knowledge benchmarks cannot evaluate. We introduce EnergyAgentBench, the first agentic benchmark grounded in live electricity market data for this problem class. The benchmark comprises 70 task variants across five families: datacenter siting under cost-carbon trade-offs (F1), long-horizon portfolio siting (F1-LH), lifetime LCOE ranki"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"We introduce EnergyAgentBench, the first agentic benchmark grounded in live electricity market data for this problem class. Claude Sonnet 4.6 achieves the highest overall score (0.900) at one-quarter the cost of Claude Opus 4.7 (0.889).","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"Ground truth derived from trained XGBoost cost-surface models (R^2 0.967--0.995) and the NREL Annual Technology Baseline 2024 accurately captures real-world cost-carbon dynamics and causal grid relationships for the 70 task variants.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"EnergyAgentBench is a new benchmark with 70 task variants that evaluates LLM agents on live energy data for datacenter siting, long-horizon optimization, and causal grid diagnosis.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"EnergyAgentBench tests LLM agents on live electricity market data to select optimal sites for AI datacenters.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"1b2668f6e446945c119bf5a6a16de21b480877933c1d19baef78257f1f7f58ba"},"source":{"id":"2605.15230","kind":"arxiv","version":1},"verdict":{"id":"91d477f2-81ce-4a5f-8d29-007705faa8fb","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-19T17:38:23.255429Z","strongest_claim":"We introduce EnergyAgentBench, the first agentic benchmark grounded in live electricity market data for this problem class. Claude Sonnet 4.6 achieves the highest overall score (0.900) at one-quarter the cost of Claude Opus 4.7 (0.889).","one_line_summary":"EnergyAgentBench is a new benchmark with 70 task variants that evaluates LLM agents on live energy data for datacenter siting, long-horizon optimization, and causal grid diagnosis.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"Ground truth derived from trained XGBoost cost-surface models (R^2 0.967--0.995) and the NREL Annual Technology Baseline 2024 accurately captures real-world cost-carbon dynamics and causal grid relationships for the 70 task variants.","pith_extraction_headline":"EnergyAgentBench tests LLM agents on live electricity market data to select optimal sites for AI datacenters."},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.15230/integrity.json","findings":[],"available":true,"detectors_run":[{"name":"claim_evidence","ran_at":"2026-05-19T18:01:56.045896Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"doi_title_agreement","ran_at":"2026-05-19T18:01:18.609127Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"doi_compliance","ran_at":"2026-05-19T17:50:41.149083Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"ai_meta_artifact","ran_at":"2026-05-19T13:33:22.830144Z","status":"skipped","version":"1.0.0","findings_count":0}],"snapshot_sha256":"adac65af59bd00eb9b25366268e755cfc60ddee03163fd4f217d9461a70896fb"},"references":{"count":37,"sample":[{"doi":"","year":2025,"title":"International Energy Agency. Energy and AI. IEA, Paris, 2025. Available: https://www.iea.org/reports/energy-and-ai","work_id":"99bfa13c-59c5-49e2-a7ab-a64ca593dd58","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2025,"title":"Energy and AI: Executive Summary","work_id":"ef1539f2-cbc1-4f25-bfe0-1a097e0ea2a1","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2025,"title":"Data Centre Electricity Use Surged in 2025","work_id":"e1147942-c384-4eab-bc48-64cf14c04413","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2025,"title":"How Much Electricity Does a Data Center Use? Complete 2025 Analysis","work_id":"6c56d2fb-ce56-4cb3-9e02-23da9927317c","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2026,"title":"Curcio, Curcio, Eliseo, Risk-Aware AI-Driven Design Optimization of Grid-Connected Hydrogen Systems Under Stochastic Operating Conditions (March 23, 2026)","work_id":"3f852892-2316-46f9-8f0a-003b76f0dd9d","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":37,"snapshot_sha256":"19b25c8904dfbb90e180f87317b0efbe293ab7e07dde7d4ef99b5136639b0bb3","internal_anchors":13},"formal_canon":{"evidence_count":2,"snapshot_sha256":"3ca9d3b7d5a45e6109f804023a7a33b526bc103d79b6622b8ba194b2727f7a84"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"}