{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:Q2EHECB53ISG2NKI3ZEONCMJHK","short_pith_number":"pith:Q2EHECB5","schema_version":"1.0","canonical_sha256":"868872083dda246d3548de48e689893aa0b1fa7a32f985a4e5d9e8055dbe063a","source":{"kind":"arxiv","id":"2605.13542","version":1},"attestation_state":"computed","paper":{"title":"RealICU: Do LLM Agents Understand Long-Context ICU Data? A Benchmark Beyond Behavior Imitation","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Large language models perform poorly on realistic long-context ICU data, revealing recall-safety tradeoffs and anchoring biases in clinical reasoning.","cross_cats":["cs.CL","cs.LG","cs.MA"],"primary_cat":"cs.AI","authors_text":"Chen (Cherise) Chen, Chengzhi Shen, Daniel Rueckert, Jiazhen Pan, Jun Li, Tobias Susetzky, Weixiang Shen, Xuepeng Zhang, Yuyuan Liu, Zhenyu Gong","submitted_at":"2026-05-13T13:52:42Z","abstract_excerpt":"Intensive care units (ICU) generate long, dense and evolving streams of clinical information, where physicians must repeatedly reassess patient states under time pressure, underscoring a clear need for reliable AI decision support. Existing ICU benchmarks typically treat historical clinician actions as ground truth. However, these actions are made under incomplete information and limited temporal context of the underlying patient state, and may therefore be suboptimal, making it difficult to assess the true reasoning capabilities of AI systems. We introduce RealICU, a hindsight-annotated bench"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":true,"formal_links_present":false},"canonical_record":{"source":{"id":"2605.13542","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2026-05-13T13:52:42Z","cross_cats_sorted":["cs.CL","cs.LG","cs.MA"],"title_canon_sha256":"516a2547d9b5616c9ec4bbe1fd0364d539bd38e6af1b8f2f84ca259cad48900c","abstract_canon_sha256":"ebf3cc96a0ae6e82b7385eb942376cf40249049280662b4bcdd3b33b57c39deb"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T02:44:24.014531Z","signature_b64":"IBPEquBuaCQCpNjO44WB90VdyTExu4oo/awyHWIXyFdVZuVMNb9iVbsin3rP3s9eDvZLoEjiAiUqYv3U3j4iAw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"868872083dda246d3548de48e689893aa0b1fa7a32f985a4e5d9e8055dbe063a","last_reissued_at":"2026-05-18T02:44:24.014076Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T02:44:24.014076Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"RealICU: Do LLM Agents Understand Long-Context ICU Data? A Benchmark Beyond Behavior Imitation","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Large language models perform poorly on realistic long-context ICU data, revealing recall-safety tradeoffs and anchoring biases in clinical reasoning.","cross_cats":["cs.CL","cs.LG","cs.MA"],"primary_cat":"cs.AI","authors_text":"Chen (Cherise) Chen, Chengzhi Shen, Daniel Rueckert, Jiazhen Pan, Jun Li, Tobias Susetzky, Weixiang Shen, Xuepeng Zhang, Yuyuan Liu, Zhenyu Gong","submitted_at":"2026-05-13T13:52:42Z","abstract_excerpt":"Intensive care units (ICU) generate long, dense and evolving streams of clinical information, where physicians must repeatedly reassess patient states under time pressure, underscoring a clear need for reliable AI decision support. Existing ICU benchmarks typically treat historical clinician actions as ground truth. However, these actions are made under incomplete information and limited temporal context of the underlying patient state, and may therefore be suboptimal, making it difficult to assess the true reasoning capabilities of AI systems. We introduce RealICU, a hindsight-annotated bench"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Existing LLMs including memory-augmented ones performed poorly on RealICU, exposing two failure modes: a recall-safety tradeoff for clinical recommendations, and an anchoring bias to early interpretations of the patient.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That senior physicians' hindsight review of full trajectories produces reliable ground-truth labels for optimal actions and red flags, despite the original clinicians operating under incomplete real-time information.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"RealICU is a new benchmark using physician hindsight labels on MIMIC-IV ICU data that exposes LLM failures in long-horizon clinical assessment, acute problem detection, action recommendation, and red-flag identification.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Large language models perform poorly on realistic long-context ICU data, revealing recall-safety tradeoffs and anchoring biases in clinical reasoning.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"d88ef7957041bb07bb48e27f59797e21433eac64c71abcfd4844d582109be72d"},"source":{"id":"2605.13542","kind":"arxiv","version":1},"verdict":{"id":"fccf52fc-1e45-4a8d-8ff0-834948e1671c","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-14T18:44:47.113134Z","strongest_claim":"Existing LLMs including memory-augmented ones performed poorly on RealICU, exposing two failure modes: a recall-safety tradeoff for clinical recommendations, and an anchoring bias to early interpretations of the patient.","one_line_summary":"RealICU is a new benchmark using physician hindsight labels on MIMIC-IV ICU data that exposes LLM failures in long-horizon clinical assessment, acute problem detection, action recommendation, and red-flag identification.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That senior physicians' hindsight review of full trajectories produces reliable ground-truth labels for optimal actions and red flags, despite the original clinicians operating under incomplete real-time information.","pith_extraction_headline":"Large language models perform poorly on realistic long-context ICU data, revealing recall-safety tradeoffs and anchoring biases in clinical reasoning."},"references":{"count":54,"sample":[{"doi":"","year":2024,"title":"A survey on rag with llms.Procedia computer science, 246:3781–3790, 2024","work_id":"ec801a64-097f-403e-badc-125a03e7fbc8","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":1998,"title":"Anthony Rocco Cassandra.Exact and approximate algorithms for partially observable Markov decision processes. Brown University, 1998","work_id":"4e44e4a8-5a9e-44df-b4aa-792173c55630","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2025,"title":"Simulating viva voce examinations to evaluate clinical reasoning in large language models.arXiv preprint arXiv:2510.10278, 2025","work_id":"e2044fd8-04c9-4d67-b22b-edcb386436cf","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"The power of noise: Redefining retrieval for rag systems","work_id":"6bd8d515-c8fa-437a-8304-ba7ba61da0be","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2021,"title":"Machine learning model for early prediction of acute kidney injury (aki) in pediatric critical care.Critical Care, 25(1):288, 2021","work_id":"cb4f0deb-b5cf-4e47-8604-7aeb7db4e1f8","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":54,"snapshot_sha256":"7a5902936d6fb30d92ef59947186be5a91eeb3409e027a4d498edf98d914364c","internal_anchors":4},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2605.13542","created_at":"2026-05-18T02:44:24.014157+00:00"},{"alias_kind":"arxiv_version","alias_value":"2605.13542v1","created_at":"2026-05-18T02:44:24.014157+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.13542","created_at":"2026-05-18T02:44:24.014157+00:00"},{"alias_kind":"pith_short_12","alias_value":"Q2EHECB53ISG","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_16","alias_value":"Q2EHECB53ISG2NKI","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_8","alias_value":"Q2EHECB5","created_at":"2026-05-18T12:33:37.589309+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/Q2EHECB53ISG2NKI3ZEONCMJHK","json":"https://pith.science/pith/Q2EHECB53ISG2NKI3ZEONCMJHK.json","graph_json":"https://pith.science/api/pith-number/Q2EHECB53ISG2NKI3ZEONCMJHK/graph.json","events_json":"https://pith.science/api/pith-number/Q2EHECB53ISG2NKI3ZEONCMJHK/events.json","paper":"https://pith.science/paper/Q2EHECB5"},"agent_actions":{"view_html":"https://pith.science/pith/Q2EHECB53ISG2NKI3ZEONCMJHK","download_json":"https://pith.science/pith/Q2EHECB53ISG2NKI3ZEONCMJHK.json","view_paper":"https://pith.science/paper/Q2EHECB5","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2605.13542&json=true","fetch_graph":"https://pith.science/api/pith-number/Q2EHECB53ISG2NKI3ZEONCMJHK/graph.json","fetch_events":"https://pith.science/api/pith-number/Q2EHECB53ISG2NKI3ZEONCMJHK/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/Q2EHECB53ISG2NKI3ZEONCMJHK/action/timestamp_anchor","attest_storage":"https://pith.science/pith/Q2EHECB53ISG2NKI3ZEONCMJHK/action/storage_attestation","attest_author":"https://pith.science/pith/Q2EHECB53ISG2NKI3ZEONCMJHK/action/author_attestation","sign_citation":"https://pith.science/pith/Q2EHECB53ISG2NKI3ZEONCMJHK/action/citation_signature","submit_replication":"https://pith.science/pith/Q2EHECB53ISG2NKI3ZEONCMJHK/action/replication_record"}},"created_at":"2026-05-18T02:44:24.014157+00:00","updated_at":"2026-05-18T02:44:24.014157+00:00"}