{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:WQ66NAS7QRJQWP4HPYLRQ2N4ST","short_pith_number":"pith:WQ66NAS7","canonical_record":{"source":{"id":"2605.15188","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-14T17:59:28Z","cross_cats_sorted":["cs.AI","cs.CL"],"title_canon_sha256":"131de9b90c4210166213f7230b50e3513bf7fc6742b5a6d98d95edbdd3897002","abstract_canon_sha256":"f79f3e8bfe083d3301c4d1dae9a620b2c49a6264f13323f074fd97ad4e825d76"},"schema_version":"1.0"},"canonical_sha256":"b43de6825f84530b3f877e171869bc94eab8d04bd392bd457dcb524b154deb46","source":{"kind":"arxiv","id":"2605.15188","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.15188","created_at":"2026-05-17T21:18:32Z"},{"alias_kind":"arxiv_version","alias_value":"2605.15188v1","created_at":"2026-05-17T21:18:32Z"},{"alias_kind":"pith_short_12","alias_value":"WQ66NAS7QRJQ","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"WQ66NAS7QRJQWP4H","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"WQ66NAS7","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:WQ66NAS7QRJQWP4HPYLRQ2N4ST","target":"record","payload":{"canonical_record":{"source":{"id":"2605.15188","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-14T17:59:28Z","cross_cats_sorted":["cs.AI","cs.CL"],"title_canon_sha256":"131de9b90c4210166213f7230b50e3513bf7fc6742b5a6d98d95edbdd3897002","abstract_canon_sha256":"f79f3e8bfe083d3301c4d1dae9a620b2c49a6264f13323f074fd97ad4e825d76"},"schema_version":"1.0"},"canonical_sha256":"b43de6825f84530b3f877e171869bc94eab8d04bd392bd457dcb524b154deb46","receipt":{"kind":"pith_receipt","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.2","canonical_sha256":"b43de6825f84530b3f877e171869bc94eab8d04bd392bd457dcb524b154deb46","last_reissued_at":"2026-05-17T21:57:18.480573Z","signature_status":"unsigned_v0","first_computed_at":"2026-05-17T21:40:25.093331Z"},"source_kind":"arxiv","source_id":"2605.15188","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T21:18:32Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"hN3GfPCYG2a7k+B6FYwHYrj+tTy3VtFlRUeQ1U0vgonKTDGUUmo1JkWcgqhn7gwc58tbrQSd6aM/QH3xgnlfDw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-20T11:16:51.680937Z"},"content_sha256":"4c00654f27a51e09a52cb90daf907df5501266081dfe481c04e4edc18c536d1f","schema_version":"1.0","event_id":"sha256:4c00654f27a51e09a52cb90daf907df5501266081dfe481c04e4edc18c536d1f"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:WQ66NAS7QRJQWP4HPYLRQ2N4ST","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"FutureSim: Replaying World Events to Evaluate Adaptive Agents","license":"http://creativecommons.org/licenses/by/4.0/","headline":"FutureSim evaluates AI agents by replaying real historical events in order and shows even the best achieve only 25 percent accuracy on future predictions.","cross_cats":["cs.AI","cs.CL"],"primary_cat":"cs.LG","authors_text":"Ameya Prabhu, Arvindh Arun, Jonas Geiping, Maksym Andriushchenko, Moritz Hardt, Nikhil Chandak, Shashwat Goel, Steffen Staab","submitted_at":"2026-05-14T17:59:28Z","abstract_excerpt":"AI agents are being increasingly deployed in dynamic, open-ended environments that require adapting to new information as it arrives. To efficiently measure this capability for realistic use-cases, we propose building grounded simulations that replay real-world events in the order they occurred. We build FutureSim, where agents forecast world events beyond their knowledge cutoff while interacting with a chronological replay of the world: real news articles arriving and questions resolving over the simulated period. We evaluate frontier agents in their native harness, testing their ability to p"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"FutureSim reveals a clear separation in their capabilities, with the best agent's accuracy being 25%, and many having worse Brier skill score than making no prediction at all.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That replaying real historical events chronologically without future knowledge leakage accurately measures an agent's adaptive capabilities in open-ended real-world settings.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"FutureSim is a benchmark that replays real news from January to March 2026 for AI agents to forecast events, with top accuracy at 25% and some agents worse than no-prediction baselines on Brier skill score.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"FutureSim evaluates AI agents by replaying real historical events in order and shows even the best achieve only 25 percent accuracy on future predictions.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"162a363276cac9ee69424ab8967097dedc85f9a09d938c5ff0d36ca3518e1ce7"},"source":{"id":"2605.15188","kind":"arxiv","version":1},"verdict":{"id":"ee187d39-2dbe-4949-9514-d3edbd034f12","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T03:08:02.403868Z","strongest_claim":"FutureSim reveals a clear separation in their capabilities, with the best agent's accuracy being 25%, and many having worse Brier skill score than making no prediction at all.","one_line_summary":"FutureSim is a benchmark that replays real news from January to March 2026 for AI agents to forecast events, with top accuracy at 25% and some agents worse than no-prediction baselines on Brier skill score.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That replaying real historical events chronologically without future knowledge leakage accurately measures an agent's adaptive capabilities in open-ended real-world settings.","pith_extraction_headline":"FutureSim evaluates AI agents by replaying real historical events in order and shows even the best achieve only 25 percent accuracy on future predictions."},"references":{"count":25,"sample":[{"doi":"10.5281/zenodo.1207631","year":2018,"title":"World models","work_id":"74007479-6f51-4839-ae30-4d6122d21c36","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2026,"title":"Lost in simulation: Llm-simulated users are unreliable proxies for human users in agentic evaluations","work_id":"1e07215b-a63b-418a-8387-6f3ccef28361","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2025,"title":"just give the model shell and tool access","work_id":"d4881466-cfe9-4f1b-95d7-3b358380ab45","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Context consumption feedback:After each tool call, the agent receives feedback about remaining context budget and approximate context occupancy. This is useful because the task spans thousands of turn","work_id":"c90e391d-d088-4e4f-880f-346a36ccc7cb","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"The goal is to make memory writing and retrieval deliberate actions rather than accidental byproducts of shell usage","work_id":"335abbb3-e733-4812-972b-67211b25b759","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":25,"snapshot_sha256":"98adbddef2b0d635a05aa2a0371620041f76819bc33f8144f0c8c6d153cc870f","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"ee187d39-2dbe-4949-9514-d3edbd034f12"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T21:57:18Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"71hszxt/t8bcV7nOATEwTVLWE6CAftqAsQuHK9Rg5iILMyfP2tJY8CQEQftvX8zBK+6t6VuWrpjk5uLmPUJADA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-20T11:16:51.681480Z"},"content_sha256":"8fb291469dd5e5b57ff8bd31e340d4efb6d813b50f07afc694e4523ef7004bb5","schema_version":"1.0","event_id":"sha256:8fb291469dd5e5b57ff8bd31e340d4efb6d813b50f07afc694e4523ef7004bb5"},{"event_type":"integrity_finding","subject_pith_number":"pith:2026:WQ66NAS7QRJQWP4HPYLRQ2N4ST","target":"integrity","payload":{"note":"URL 'https://openreview' returned status transport error (transport error: [Errno -3] Temporary failure in name resolution) at last check.","snippet":null,"arxiv_id":"2605.15188","detector":"external_links","evidence":{"url":"https://openreview","final_url":null,"host_kind":"website","status_code":0,"status_text":"transport error: [Errno -3] Temporary failure in name resolution","verdict_class":"incontrovertible","checked_at_unix":1779190316.9025543},"severity":"advisory","ref_index":null,"audited_at":"2026-05-19T11:31:58.948976Z","event_type":"pith.integrity.v1","detected_doi":null,"detector_url":"https://pith.science/pith-integrity-protocol#external_links","external_url":"https://openreview","finding_type":"dead_url","evidence_hash":"9fd7c71e8e9f15a300f4980a8a7530b1d1087994347b34df41405a3075989771","paper_version":1,"verdict_class":"incontrovertible","resolved_title":null,"detector_version":"1.0.0","detected_arxiv_id":null,"integrity_event_id":1057,"payload_sha256":"27e45336fa2af12774f2f8480e0f98f9f1052504761bd342659261ff3bd182f3","signature_b64":"TOvQTF+iz9p9kv/HALKPQ+1VLGIMXL+DCIkoRZ8Gomh3qxMatt2sAjmzquR2zfV+GX4d2UjSBN3+AE9HLdXlCQ==","signing_key_id":"pith-v1-2026-05"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-19T11:32:01Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"VRvEScltVmpzsowbbUObCvbERSScVuM/7+O3XaU5oPp33c8l3IRlmpdnmKeCbc52aTs2tmINblQDw/p967EOBg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-20T11:16:51.682355Z"},"content_sha256":"be501cef01e29df3567b38a265a509161d45d438054ee5d4be07e5bfbb681825","schema_version":"1.0","event_id":"sha256:be501cef01e29df3567b38a265a509161d45d438054ee5d4be07e5bfbb681825"},{"event_type":"integrity_finding","subject_pith_number":"pith:2026:WQ66NAS7QRJQWP4HPYLRQ2N4ST","target":"integrity","payload":{"note":"URL 'https://arxiv.org/abs/' returned status 404 (Not Found) at last check.","snippet":null,"arxiv_id":"2605.15188","detector":"external_links","evidence":{"url":"https://arxiv.org/abs/","final_url":"https://arxiv.org/abs/","host_kind":"arxiv","status_code":404,"status_text":"Not Found","verdict_class":"incontrovertible","checked_at_unix":1779190314.7295334},"severity":"advisory","ref_index":null,"audited_at":"2026-05-19T11:31:58.948976Z","event_type":"pith.integrity.v1","detected_doi":null,"detector_url":"https://pith.science/pith-integrity-protocol#external_links","external_url":"https://arxiv.org/abs/","finding_type":"dead_url","evidence_hash":"631346952efbcb0ff162cea843b91430b0f5bcf8b495b2e39cf8834ff2b8886a","paper_version":1,"verdict_class":"incontrovertible","resolved_title":null,"detector_version":"1.0.0","detected_arxiv_id":null,"integrity_event_id":1056,"payload_sha256":"5a9a3ceea6b749e2af90c89436dbd592c314f281f9ee8e4083ddab8de7e37365","signature_b64":"SO3Xm91Y3ElfcgmSno62LE74eFIZezXEpOFKUUe7X4uwXgFeG7J22v1O6fse8suII2W1VdWaZjqFTj7E73TgDw==","signing_key_id":"pith-v1-2026-05"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-19T11:32:01Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"WPoP0u+w4xTEvVtcAfuQPp8t3GxJm3zRX/8wgxDfYj7s/MUb0/2pk0i2G90qEg/26MNL7ZkqfSho3gR0NPctAQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-20T11:16:51.682641Z"},"content_sha256":"9376585e53be99e7ad9bfb181faccef8df6751b9474f5e08c3d84889c4904a10","schema_version":"1.0","event_id":"sha256:9376585e53be99e7ad9bfb181faccef8df6751b9474f5e08c3d84889c4904a10"},{"event_type":"integrity_finding","subject_pith_number":"pith:2026:WQ66NAS7QRJQWP4HPYLRQ2N4ST","target":"integrity","payload":{"note":"URL 'https://arxiv' returned status transport error (transport error: [Errno -3] Temporary failure in name resolution) at last check.","snippet":null,"arxiv_id":"2605.15188","detector":"external_links","evidence":{"url":"https://arxiv","final_url":null,"host_kind":"website","status_code":0,"status_text":"transport error: [Errno -3] Temporary failure in name resolution","verdict_class":"incontrovertible","checked_at_unix":1779190314.4808075},"severity":"advisory","ref_index":null,"audited_at":"2026-05-19T11:31:58.948976Z","event_type":"pith.integrity.v1","detected_doi":null,"detector_url":"https://pith.science/pith-integrity-protocol#external_links","external_url":"https://arxiv","finding_type":"dead_url","evidence_hash":"6977c23f15709e96c1231ac3b40fd4727b946722cf8fa9639b7735fe5ee3f06b","paper_version":1,"verdict_class":"incontrovertible","resolved_title":null,"detector_version":"1.0.0","detected_arxiv_id":null,"integrity_event_id":1055,"payload_sha256":"302687e08f4f9aa14da089228504427c019bb4840f9e0c3c5c6b12fd5d59d595","signature_b64":"z/0CJwYuUAZDKrkCSJpr8KfWe54v1odnlohnZrQ0wUm0k6S4HjBcbkajMa7n+dD76iHLukTeTTDOIpn8gpqTAA==","signing_key_id":"pith-v1-2026-05"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-19T11:32:01Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"xcXyFMVNSxE2edFpwepcT6+eWaBQEulApsASDXeysAxXBYy3wYsy+pp8YhslXaGpHie2HrkRH4tLYKrFT6BKDg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-20T11:16:51.682920Z"},"content_sha256":"c95400052c9be07bd32b18681e6a3ba5f8b6d386d9aad63ffa1dfb7945be25b6","schema_version":"1.0","event_id":"sha256:c95400052c9be07bd32b18681e6a3ba5f8b6d386d9aad63ffa1dfb7945be25b6"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/WQ66NAS7QRJQWP4HPYLRQ2N4ST/bundle.json","state_url":"https://pith.science/pith/WQ66NAS7QRJQWP4HPYLRQ2N4ST/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/WQ66NAS7QRJQWP4HPYLRQ2N4ST/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-20T11:16:51Z","links":{"resolver":"https://pith.science/pith/WQ66NAS7QRJQWP4HPYLRQ2N4ST","bundle":"https://pith.science/pith/WQ66NAS7QRJQWP4HPYLRQ2N4ST/bundle.json","state":"https://pith.science/pith/WQ66NAS7QRJQWP4HPYLRQ2N4ST/state.json","well_known_bundle":"https://pith.science/.well-known/pith/WQ66NAS7QRJQWP4HPYLRQ2N4ST/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:WQ66NAS7QRJQWP4HPYLRQ2N4ST","merge_version":"pith-open-graph-merge-v1","event_count":5,"valid_event_count":5,"invalid_event_count":0,"equivocation_count":1,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"f79f3e8bfe083d3301c4d1dae9a620b2c49a6264f13323f074fd97ad4e825d76","cross_cats_sorted":["cs.AI","cs.CL"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-14T17:59:28Z","title_canon_sha256":"131de9b90c4210166213f7230b50e3513bf7fc6742b5a6d98d95edbdd3897002"},"schema_version":"1.0","source":{"id":"2605.15188","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.15188","created_at":"2026-05-17T21:18:32Z"},{"alias_kind":"arxiv_version","alias_value":"2605.15188v1","created_at":"2026-05-17T21:18:32Z"},{"alias_kind":"pith_short_12","alias_value":"WQ66NAS7QRJQ","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"WQ66NAS7QRJQWP4H","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"WQ66NAS7","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:8fb291469dd5e5b57ff8bd31e340d4efb6d813b50f07afc694e4523ef7004bb5","target":"graph","created_at":"2026-05-17T21:57:18Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"FutureSim reveals a clear separation in their capabilities, with the best agent's accuracy being 25%, and many having worse Brier skill score than making no prediction at all."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That replaying real historical events chronologically without future knowledge leakage accurately measures an agent's adaptive capabilities in open-ended real-world settings."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"FutureSim is a benchmark that replays real news from January to March 2026 for AI agents to forecast events, with top accuracy at 25% and some agents worse than no-prediction baselines on Brier skill score."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"FutureSim evaluates AI agents by replaying real historical events in order and shows even the best achieve only 25 percent accuracy on future predictions."}],"snapshot_sha256":"162a363276cac9ee69424ab8967097dedc85f9a09d938c5ff0d36ca3518e1ce7"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"AI agents are being increasingly deployed in dynamic, open-ended environments that require adapting to new information as it arrives. To efficiently measure this capability for realistic use-cases, we propose building grounded simulations that replay real-world events in the order they occurred. We build FutureSim, where agents forecast world events beyond their knowledge cutoff while interacting with a chronological replay of the world: real news articles arriving and questions resolving over the simulated period. We evaluate frontier agents in their native harness, testing their ability to p","authors_text":"Ameya Prabhu, Arvindh Arun, Jonas Geiping, Maksym Andriushchenko, Moritz Hardt, Nikhil Chandak, Shashwat Goel, Steffen Staab","cross_cats":["cs.AI","cs.CL"],"headline":"FutureSim evaluates AI agents by replaying real historical events in order and shows even the best achieve only 25 percent accuracy on future predictions.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-14T17:59:28Z","title":"FutureSim: Replaying World Events to Evaluate Adaptive Agents"},"references":{"count":25,"internal_anchors":0,"resolved_work":25,"sample":[{"cited_arxiv_id":"","doi":"10.5281/zenodo.1207631","is_internal_anchor":false,"ref_index":1,"title":"World models","work_id":"74007479-6f51-4839-ae30-4d6122d21c36","year":2018},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Lost in simulation: Llm-simulated users are unreliable proxies for human users in agentic evaluations","work_id":"1e07215b-a63b-418a-8387-6f3ccef28361","year":2026},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"just give the model shell and tool access","work_id":"d4881466-cfe9-4f1b-95d7-3b358380ab45","year":2025},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Context consumption feedback:After each tool call, the agent receives feedback about remaining context budget and approximate context occupancy. This is useful because the task spans thousands of turn","work_id":"c90e391d-d088-4e4f-880f-346a36ccc7cb","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"The goal is to make memory writing and retrieval deliberate actions rather than accidental byproducts of shell usage","work_id":"335abbb3-e733-4812-972b-67211b25b759","year":null}],"snapshot_sha256":"98adbddef2b0d635a05aa2a0371620041f76819bc33f8144f0c8c6d153cc870f"},"source":{"id":"2605.15188","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-15T03:08:02.403868Z","id":"ee187d39-2dbe-4949-9514-d3edbd034f12","model_set":{"reader":"grok-4.3"},"one_line_summary":"FutureSim is a benchmark that replays real news from January to March 2026 for AI agents to forecast events, with top accuracy at 25% and some agents worse than no-prediction baselines on Brier skill score.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"FutureSim evaluates AI agents by replaying real historical events in order and shows even the best achieve only 25 percent accuracy on future predictions.","strongest_claim":"FutureSim reveals a clear separation in their capabilities, with the best agent's accuracy being 25%, and many having worse Brier skill score than making no prediction at all.","weakest_assumption":"That replaying real historical events chronologically without future knowledge leakage accurately measures an agent's adaptive capabilities in open-ended real-world settings."}},"verdict_id":"ee187d39-2dbe-4949-9514-d3edbd034f12"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:4c00654f27a51e09a52cb90daf907df5501266081dfe481c04e4edc18c536d1f","target":"record","created_at":"2026-05-17T21:18:32Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"f79f3e8bfe083d3301c4d1dae9a620b2c49a6264f13323f074fd97ad4e825d76","cross_cats_sorted":["cs.AI","cs.CL"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-14T17:59:28Z","title_canon_sha256":"131de9b90c4210166213f7230b50e3513bf7fc6742b5a6d98d95edbdd3897002"},"schema_version":"1.0","source":{"id":"2605.15188","kind":"arxiv","version":1}},"canonical_sha256":"b43de6825f84530b3f877e171869bc94eab8d04bd392bd457dcb524b154deb46","receipt":{"builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"b43de6825f84530b3f877e171869bc94eab8d04bd392bd457dcb524b154deb46","first_computed_at":"2026-05-17T21:40:25.093331Z","kind":"pith_receipt","last_reissued_at":"2026-05-17T21:57:18.480573Z","receipt_version":"0.2","signature_status":"unsigned_v0"},"source_id":"2605.15188","source_kind":"arxiv","source_version":1}}},"equivocations":[{"signer_id":"pith.science","event_type":"integrity_finding","target":"integrity","event_ids":["sha256:9376585e53be99e7ad9bfb181faccef8df6751b9474f5e08c3d84889c4904a10","sha256:be501cef01e29df3567b38a265a509161d45d438054ee5d4be07e5bfbb681825","sha256:c95400052c9be07bd32b18681e6a3ba5f8b6d386d9aad63ffa1dfb7945be25b6"]}],"invalid_events":[],"applied_event_ids":["sha256:4c00654f27a51e09a52cb90daf907df5501266081dfe481c04e4edc18c536d1f","sha256:8fb291469dd5e5b57ff8bd31e340d4efb6d813b50f07afc694e4523ef7004bb5"],"state_sha256":"2fb837dd57b39f1230e2d9ad5b8eb5092237e084fc858655377434c4426ebfc9"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"OQkJX0nlLzlE6/wev/AO1Buw3YUOeYu8M3HkIb8iSCrY0gbeSy4JgUAqbtVe+r3gAb59x4lzD3DgIS8A++LDAQ==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-20T11:16:51.685606Z","bundle_sha256":"edad4e0cfa979b8dd6a221fbcbc8cc48e65e7d0de4c440d60af9d929887b913b"}}