{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:RX25OYSZQAVF5OYQF4QZLFJZAJ","short_pith_number":"pith:RX25OYSZ","canonical_record":{"source":{"id":"2604.27604","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-04-30T08:57:18Z","cross_cats_sorted":["cs.CE"],"title_canon_sha256":"818073f2463a46cb1a02677f9b60fabb582f65bbe2f9c5f99d588c77af9dea61","abstract_canon_sha256":"ae953b014ecf4a05949ea6189d25d618ac581716eddb0d5331dafc500d53350b"},"schema_version":"1.0"},"canonical_sha256":"8df5d76259802a5ebb102f21959539024ada879e5eb2e6e5082f5c167b286a8b","source":{"kind":"arxiv","id":"2604.27604","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2604.27604","created_at":"2026-05-27T01:05:55Z"},{"alias_kind":"arxiv_version","alias_value":"2604.27604v2","created_at":"2026-05-27T01:05:55Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2604.27604","created_at":"2026-05-27T01:05:55Z"},{"alias_kind":"pith_short_12","alias_value":"RX25OYSZQAVF","created_at":"2026-05-27T01:05:55Z"},{"alias_kind":"pith_short_16","alias_value":"RX25OYSZQAVF5OYQ","created_at":"2026-05-27T01:05:55Z"},{"alias_kind":"pith_short_8","alias_value":"RX25OYSZ","created_at":"2026-05-27T01:05:55Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:RX25OYSZQAVF5OYQF4QZLFJZAJ","target":"record","payload":{"canonical_record":{"source":{"id":"2604.27604","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-04-30T08:57:18Z","cross_cats_sorted":["cs.CE"],"title_canon_sha256":"818073f2463a46cb1a02677f9b60fabb582f65bbe2f9c5f99d588c77af9dea61","abstract_canon_sha256":"ae953b014ecf4a05949ea6189d25d618ac581716eddb0d5331dafc500d53350b"},"schema_version":"1.0"},"canonical_sha256":"8df5d76259802a5ebb102f21959539024ada879e5eb2e6e5082f5c167b286a8b","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-27T01:05:55.425161Z","signature_b64":"wasy1hVY98nqtPbli6kdJ0mFqgorKbtnUo4kzVmJMcYQiinmgG6t9khGbcPSor3rj1fmO8H/1xekNl7AFM93AA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"8df5d76259802a5ebb102f21959539024ada879e5eb2e6e5082f5c167b286a8b","last_reissued_at":"2026-05-27T01:05:55.424421Z","signature_status":"signed_v1","first_computed_at":"2026-05-27T01:05:55.424421Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2604.27604","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-27T01:05:55Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"Y3yE71mXmpJYxzXar/cBXAwqJWK8RNn37qM3hBCymYaQUoD2Y90vdwR3MiWu8yyojmAyRId+BytQ5UzPcgA9CA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-01T21:50:14.350578Z"},"content_sha256":"66eea68d4f2c1195d9412859e93dabeb7ff57bd0f21b1ed7f862b223bbf6877e","schema_version":"1.0","event_id":"sha256:66eea68d4f2c1195d9412859e93dabeb7ff57bd0f21b1ed7f862b223bbf6877e"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:RX25OYSZQAVF5OYQF4QZLFJZAJ","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Decoding Scientific Experimental Images: The SPUR Benchmark for Perception, Understanding, and Reasoning","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"Current multimodal AI models fall significantly short of expert-level performance when interpreting scientific experimental images.","cross_cats":["cs.CE"],"primary_cat":"cs.CV","authors_text":"Haihong E, Haiyang Sun, Haocheng Gao, Haolin Tian, Jiacheng Liu, Jintong Chen, Junpeng Ding, Mengyuan Ji, Peizhi Zhao, Pengqi Sun, Rongjin Li, Ruomeng Jiang, Siying Lin, Yang Liu, Yang Xu, Yichen Liu, Yuanze Li, Zhongjun Yang, Zichen Tang, Zijie Xi","submitted_at":"2026-04-30T08:57:18Z","abstract_excerpt":"We introduce SPUR, a comprehensive benchmark for scientific experimental image perception, understanding, and reasoning, comprising 4,264 question-answering (QA) pairs derived from 1,084 expert-curated images. SPUR features three key innovations: (1) Panel-Level Fine-Grained Perception: evaluating the visual perception of multimodal large language models (MLLMs) across three dimensions (numerical, morphological, and information localization) on six fine-grained panel types; (2) Cross-Panel Relation Understanding: utilizing complex images with an average of 14.3 panels per sample to evaluate ML"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Comprehensive evaluation of 20 MLLMs and four multimodal Chain-of-Thought (MCoT) methods reveals that current models fall significantly short of the expert-level requirements for scientific image interpretation, underscoring a critical bottleneck in AI for Science (AI4S) research.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The assumption that the expert-curated images, panel classifications, and generated QA pairs accurately and without bias represent the full range of expert-level perception, cross-panel understanding, and reasoning required for scientific experimental images.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"SPUR benchmark reveals that current multimodal large language models significantly underperform on expert-level perception, cross-panel understanding, and reasoning tasks with complex scientific experimental images.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Current multimodal AI models fall significantly short of expert-level performance when interpreting scientific experimental images.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"4624efcd2eed3ea7d555d61eab9de1987ec4e4a1145d586bf70a6cc07af9d161"},"source":{"id":"2604.27604","kind":"arxiv","version":2},"verdict":{"id":"04e7dc5f-7ff0-4331-bcae-dcda657eb12d","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-07T06:49:23.383646Z","strongest_claim":"Comprehensive evaluation of 20 MLLMs and four multimodal Chain-of-Thought (MCoT) methods reveals that current models fall significantly short of the expert-level requirements for scientific image interpretation, underscoring a critical bottleneck in AI for Science (AI4S) research.","one_line_summary":"SPUR benchmark reveals that current multimodal large language models significantly underperform on expert-level perception, cross-panel understanding, and reasoning tasks with complex scientific experimental images.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The assumption that the expert-curated images, panel classifications, and generated QA pairs accurately and without bias represent the full range of expert-level perception, cross-panel understanding, and reasoning required for scientific experimental images.","pith_extraction_headline":"Current multimodal AI models fall significantly short of expert-level performance when interpreting scientific experimental images."},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2604.27604/integrity.json","findings":[],"available":true,"detectors_run":[{"name":"ai_meta_artifact","ran_at":"2026-05-20T21:43:38.076614Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"doi_compliance","ran_at":"2026-05-19T19:04:00.127223Z","status":"completed","version":"1.0.0","findings_count":0}],"snapshot_sha256":"2e8fc36ddad99c8638383b57aa0472441f64832ed99d64fd1259d2c17d4f1c7e"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"04e7dc5f-7ff0-4331-bcae-dcda657eb12d"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-27T01:05:55Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"np/cuSUAluFnLK6OWxwk0LrJh5f8JuyOgD/gudCfZT6D/Jmj56s3eHJxClBmo4zs0Dy3oCXP5EJTB/dnhkpBAQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-01T21:50:14.351070Z"},"content_sha256":"f1eb1ffbf8d3060516b1558997f3e8f420a36b2f778b67f3627c2aa307abae52","schema_version":"1.0","event_id":"sha256:f1eb1ffbf8d3060516b1558997f3e8f420a36b2f778b67f3627c2aa307abae52"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/RX25OYSZQAVF5OYQF4QZLFJZAJ/bundle.json","state_url":"https://pith.science/pith/RX25OYSZQAVF5OYQF4QZLFJZAJ/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/RX25OYSZQAVF5OYQF4QZLFJZAJ/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-01T21:50:14Z","links":{"resolver":"https://pith.science/pith/RX25OYSZQAVF5OYQF4QZLFJZAJ","bundle":"https://pith.science/pith/RX25OYSZQAVF5OYQF4QZLFJZAJ/bundle.json","state":"https://pith.science/pith/RX25OYSZQAVF5OYQF4QZLFJZAJ/state.json","well_known_bundle":"https://pith.science/.well-known/pith/RX25OYSZQAVF5OYQF4QZLFJZAJ/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:RX25OYSZQAVF5OYQF4QZLFJZAJ","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"ae953b014ecf4a05949ea6189d25d618ac581716eddb0d5331dafc500d53350b","cross_cats_sorted":["cs.CE"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-04-30T08:57:18Z","title_canon_sha256":"818073f2463a46cb1a02677f9b60fabb582f65bbe2f9c5f99d588c77af9dea61"},"schema_version":"1.0","source":{"id":"2604.27604","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2604.27604","created_at":"2026-05-27T01:05:55Z"},{"alias_kind":"arxiv_version","alias_value":"2604.27604v2","created_at":"2026-05-27T01:05:55Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2604.27604","created_at":"2026-05-27T01:05:55Z"},{"alias_kind":"pith_short_12","alias_value":"RX25OYSZQAVF","created_at":"2026-05-27T01:05:55Z"},{"alias_kind":"pith_short_16","alias_value":"RX25OYSZQAVF5OYQ","created_at":"2026-05-27T01:05:55Z"},{"alias_kind":"pith_short_8","alias_value":"RX25OYSZ","created_at":"2026-05-27T01:05:55Z"}],"graph_snapshots":[{"event_id":"sha256:f1eb1ffbf8d3060516b1558997f3e8f420a36b2f778b67f3627c2aa307abae52","target":"graph","created_at":"2026-05-27T01:05:55Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Comprehensive evaluation of 20 MLLMs and four multimodal Chain-of-Thought (MCoT) methods reveals that current models fall significantly short of the expert-level requirements for scientific image interpretation, underscoring a critical bottleneck in AI for Science (AI4S) research."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"The assumption that the expert-curated images, panel classifications, and generated QA pairs accurately and without bias represent the full range of expert-level perception, cross-panel understanding, and reasoning required for scientific experimental images."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"SPUR benchmark reveals that current multimodal large language models significantly underperform on expert-level perception, cross-panel understanding, and reasoning tasks with complex scientific experimental images."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Current multimodal AI models fall significantly short of expert-level performance when interpreting scientific experimental images."}],"snapshot_sha256":"4624efcd2eed3ea7d555d61eab9de1987ec4e4a1145d586bf70a6cc07af9d161"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[{"findings_count":0,"name":"ai_meta_artifact","ran_at":"2026-05-20T21:43:38.076614Z","status":"completed","version":"1.0.0"},{"findings_count":0,"name":"doi_compliance","ran_at":"2026-05-19T19:04:00.127223Z","status":"completed","version":"1.0.0"}],"endpoint":"/pith/2604.27604/integrity.json","findings":[],"snapshot_sha256":"2e8fc36ddad99c8638383b57aa0472441f64832ed99d64fd1259d2c17d4f1c7e","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"We introduce SPUR, a comprehensive benchmark for scientific experimental image perception, understanding, and reasoning, comprising 4,264 question-answering (QA) pairs derived from 1,084 expert-curated images. SPUR features three key innovations: (1) Panel-Level Fine-Grained Perception: evaluating the visual perception of multimodal large language models (MLLMs) across three dimensions (numerical, morphological, and information localization) on six fine-grained panel types; (2) Cross-Panel Relation Understanding: utilizing complex images with an average of 14.3 panels per sample to evaluate ML","authors_text":"Haihong E, Haiyang Sun, Haocheng Gao, Haolin Tian, Jiacheng Liu, Jintong Chen, Junpeng Ding, Mengyuan Ji, Peizhi Zhao, Pengqi Sun, Rongjin Li, Ruomeng Jiang, Siying Lin, Yang Liu, Yang Xu, Yichen Liu, Yuanze Li, Zhongjun Yang, Zichen Tang, Zijie Xi","cross_cats":["cs.CE"],"headline":"Current multimodal AI models fall significantly short of expert-level performance when interpreting scientific experimental images.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-04-30T08:57:18Z","title":"Decoding Scientific Experimental Images: The SPUR Benchmark for Perception, Understanding, and Reasoning"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2604.27604","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-07T06:49:23.383646Z","id":"04e7dc5f-7ff0-4331-bcae-dcda657eb12d","model_set":{"reader":"grok-4.3"},"one_line_summary":"SPUR benchmark reveals that current multimodal large language models significantly underperform on expert-level perception, cross-panel understanding, and reasoning tasks with complex scientific experimental images.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Current multimodal AI models fall significantly short of expert-level performance when interpreting scientific experimental images.","strongest_claim":"Comprehensive evaluation of 20 MLLMs and four multimodal Chain-of-Thought (MCoT) methods reveals that current models fall significantly short of the expert-level requirements for scientific image interpretation, underscoring a critical bottleneck in AI for Science (AI4S) research.","weakest_assumption":"The assumption that the expert-curated images, panel classifications, and generated QA pairs accurately and without bias represent the full range of expert-level perception, cross-panel understanding, and reasoning required for scientific experimental images."}},"verdict_id":"04e7dc5f-7ff0-4331-bcae-dcda657eb12d"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:66eea68d4f2c1195d9412859e93dabeb7ff57bd0f21b1ed7f862b223bbf6877e","target":"record","created_at":"2026-05-27T01:05:55Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"ae953b014ecf4a05949ea6189d25d618ac581716eddb0d5331dafc500d53350b","cross_cats_sorted":["cs.CE"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-04-30T08:57:18Z","title_canon_sha256":"818073f2463a46cb1a02677f9b60fabb582f65bbe2f9c5f99d588c77af9dea61"},"schema_version":"1.0","source":{"id":"2604.27604","kind":"arxiv","version":2}},"canonical_sha256":"8df5d76259802a5ebb102f21959539024ada879e5eb2e6e5082f5c167b286a8b","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"8df5d76259802a5ebb102f21959539024ada879e5eb2e6e5082f5c167b286a8b","first_computed_at":"2026-05-27T01:05:55.424421Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-27T01:05:55.424421Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"wasy1hVY98nqtPbli6kdJ0mFqgorKbtnUo4kzVmJMcYQiinmgG6t9khGbcPSor3rj1fmO8H/1xekNl7AFM93AA==","signature_status":"signed_v1","signed_at":"2026-05-27T01:05:55.425161Z","signed_message":"canonical_sha256_bytes"},"source_id":"2604.27604","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:66eea68d4f2c1195d9412859e93dabeb7ff57bd0f21b1ed7f862b223bbf6877e","sha256:f1eb1ffbf8d3060516b1558997f3e8f420a36b2f778b67f3627c2aa307abae52"],"state_sha256":"2bf60628f203bbc964a6c73b1c73bb4e9114f3f90c260f90d97b016682329da3"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"ZpusNSysdWhNa5qE04W0L3QkIg7UoxY4m1CXHhwB6LKWw9NSUSIhuKogPWelT0QZZ5jzGOTAlVZ7I5mRT1IaDQ==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-01T21:50:14.353470Z","bundle_sha256":"9de398cd122214b31430421cfc426350f885c5635372832a82dbd692d0496ede"}}