{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:WQ66NAS7QRJQWP4HPYLRQ2N4ST","merge_version":"pith-open-graph-merge-v1","event_count":5,"valid_event_count":5,"invalid_event_count":0,"equivocation_count":1,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"f79f3e8bfe083d3301c4d1dae9a620b2c49a6264f13323f074fd97ad4e825d76","cross_cats_sorted":["cs.AI","cs.CL"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-14T17:59:28Z","title_canon_sha256":"131de9b90c4210166213f7230b50e3513bf7fc6742b5a6d98d95edbdd3897002"},"schema_version":"1.0","source":{"id":"2605.15188","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.15188","created_at":"2026-05-17T21:18:32Z"},{"alias_kind":"arxiv_version","alias_value":"2605.15188v1","created_at":"2026-05-17T21:18:32Z"},{"alias_kind":"pith_short_12","alias_value":"WQ66NAS7QRJQ","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"WQ66NAS7QRJQWP4H","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"WQ66NAS7","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:8fb291469dd5e5b57ff8bd31e340d4efb6d813b50f07afc694e4523ef7004bb5","target":"graph","created_at":"2026-05-17T21:57:18Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"FutureSim reveals a clear separation in their capabilities, with the best agent's accuracy being 25%, and many having worse Brier skill score than making no prediction at all."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That replaying real historical events chronologically without future knowledge leakage accurately measures an agent's adaptive capabilities in open-ended real-world settings."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"FutureSim is a benchmark that replays real news from January to March 2026 for AI agents to forecast events, with top accuracy at 25% and some agents worse than no-prediction baselines on Brier skill score."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"FutureSim evaluates AI agents by replaying real historical events in order and shows even the best achieve only 25 percent accuracy on future predictions."}],"snapshot_sha256":"162a363276cac9ee69424ab8967097dedc85f9a09d938c5ff0d36ca3518e1ce7"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"AI agents are being increasingly deployed in dynamic, open-ended environments that require adapting to new information as it arrives. To efficiently measure this capability for realistic use-cases, we propose building grounded simulations that replay real-world events in the order they occurred. We build FutureSim, where agents forecast world events beyond their knowledge cutoff while interacting with a chronological replay of the world: real news articles arriving and questions resolving over the simulated period. We evaluate frontier agents in their native harness, testing their ability to p","authors_text":"Ameya Prabhu, Arvindh Arun, Jonas Geiping, Maksym Andriushchenko, Moritz Hardt, Nikhil Chandak, Shashwat Goel, Steffen Staab","cross_cats":["cs.AI","cs.CL"],"headline":"FutureSim evaluates AI agents by replaying real historical events in order and shows even the best achieve only 25 percent accuracy on future predictions.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-14T17:59:28Z","title":"FutureSim: Replaying World Events to Evaluate Adaptive Agents"},"references":{"count":25,"internal_anchors":0,"resolved_work":25,"sample":[{"cited_arxiv_id":"","doi":"10.5281/zenodo.1207631","is_internal_anchor":false,"ref_index":1,"title":"World models","work_id":"74007479-6f51-4839-ae30-4d6122d21c36","year":2018},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Lost in simulation: Llm-simulated users are unreliable proxies for human users in agentic evaluations","work_id":"1e07215b-a63b-418a-8387-6f3ccef28361","year":2026},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"just give the model shell and tool access","work_id":"d4881466-cfe9-4f1b-95d7-3b358380ab45","year":2025},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Context consumption feedback:After each tool call, the agent receives feedback about remaining context budget and approximate context occupancy. This is useful because the task spans thousands of turn","work_id":"c90e391d-d088-4e4f-880f-346a36ccc7cb","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"The goal is to make memory writing and retrieval deliberate actions rather than accidental byproducts of shell usage","work_id":"335abbb3-e733-4812-972b-67211b25b759","year":null}],"snapshot_sha256":"98adbddef2b0d635a05aa2a0371620041f76819bc33f8144f0c8c6d153cc870f"},"source":{"id":"2605.15188","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-15T03:08:02.403868Z","id":"ee187d39-2dbe-4949-9514-d3edbd034f12","model_set":{"reader":"grok-4.3"},"one_line_summary":"FutureSim is a benchmark that replays real news from January to March 2026 for AI agents to forecast events, with top accuracy at 25% and some agents worse than no-prediction baselines on Brier skill score.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"FutureSim evaluates AI agents by replaying real historical events in order and shows even the best achieve only 25 percent accuracy on future predictions.","strongest_claim":"FutureSim reveals a clear separation in their capabilities, with the best agent's accuracy being 25%, and many having worse Brier skill score than making no prediction at all.","weakest_assumption":"That replaying real historical events chronologically without future knowledge leakage accurately measures an agent's adaptive capabilities in open-ended real-world settings."}},"verdict_id":"ee187d39-2dbe-4949-9514-d3edbd034f12"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:4c00654f27a51e09a52cb90daf907df5501266081dfe481c04e4edc18c536d1f","target":"record","created_at":"2026-05-17T21:18:32Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"f79f3e8bfe083d3301c4d1dae9a620b2c49a6264f13323f074fd97ad4e825d76","cross_cats_sorted":["cs.AI","cs.CL"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-14T17:59:28Z","title_canon_sha256":"131de9b90c4210166213f7230b50e3513bf7fc6742b5a6d98d95edbdd3897002"},"schema_version":"1.0","source":{"id":"2605.15188","kind":"arxiv","version":1}},"canonical_sha256":"b43de6825f84530b3f877e171869bc94eab8d04bd392bd457dcb524b154deb46","receipt":{"builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"b43de6825f84530b3f877e171869bc94eab8d04bd392bd457dcb524b154deb46","first_computed_at":"2026-05-17T21:40:25.093331Z","kind":"pith_receipt","last_reissued_at":"2026-05-17T21:57:18.480573Z","receipt_version":"0.2","signature_status":"unsigned_v0"},"source_id":"2605.15188","source_kind":"arxiv","source_version":1}}},"equivocations":[{"signer_id":"pith.science","event_type":"integrity_finding","target":"integrity","event_ids":["sha256:9376585e53be99e7ad9bfb181faccef8df6751b9474f5e08c3d84889c4904a10","sha256:be501cef01e29df3567b38a265a509161d45d438054ee5d4be07e5bfbb681825","sha256:c95400052c9be07bd32b18681e6a3ba5f8b6d386d9aad63ffa1dfb7945be25b6"]}],"invalid_events":[],"applied_event_ids":["sha256:4c00654f27a51e09a52cb90daf907df5501266081dfe481c04e4edc18c536d1f","sha256:8fb291469dd5e5b57ff8bd31e340d4efb6d813b50f07afc694e4523ef7004bb5"],"state_sha256":"2fb837dd57b39f1230e2d9ad5b8eb5092237e084fc858655377434c4426ebfc9"}