{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2024:4O7HQCVOBKXK7AXX7NVR7JHMEM","short_pith_number":"pith:4O7HQCVO","canonical_record":{"source":{"id":"2408.13257","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2024-08-23T17:59:51Z","cross_cats_sorted":[],"title_canon_sha256":"b579ae444d9a4bd2c060336112ea901912da858435643d1dae227d8eabb9fa89","abstract_canon_sha256":"5d3bbb8b38c0d16507887f6e562134f837670dcf21be01c1811979ce43518d33"},"schema_version":"1.0"},"canonical_sha256":"e3be780aae0aaeaf82f7fb6b1fa4ec232acef96c97153915e475c39bf8505b35","source":{"kind":"arxiv","id":"2408.13257","version":3},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2408.13257","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"arxiv_version","alias_value":"2408.13257v3","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2408.13257","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"pith_short_12","alias_value":"4O7HQCVOBKXK","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"4O7HQCVOBKXK7AXX","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"4O7HQCVO","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2024:4O7HQCVOBKXK7AXX7NVR7JHMEM","target":"record","payload":{"canonical_record":{"source":{"id":"2408.13257","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2024-08-23T17:59:51Z","cross_cats_sorted":[],"title_canon_sha256":"b579ae444d9a4bd2c060336112ea901912da858435643d1dae227d8eabb9fa89","abstract_canon_sha256":"5d3bbb8b38c0d16507887f6e562134f837670dcf21be01c1811979ce43518d33"},"schema_version":"1.0"},"canonical_sha256":"e3be780aae0aaeaf82f7fb6b1fa4ec232acef96c97153915e475c39bf8505b35","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:48.585247Z","signature_b64":"47xa0Nieyv6XBgIH3DWRsIsLgnu6TLkAVI9CSEwBRtYD+5l0D0rjA5htEED7beRG9YCUEb/0MebrQVk9sXZoBg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"e3be780aae0aaeaf82f7fb6b1fa4ec232acef96c97153915e475c39bf8505b35","last_reissued_at":"2026-05-17T23:38:48.584764Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:48.584764Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2408.13257","source_version":3,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:48Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"ODTsmAH/JiaNRXYcVqofnj0Mm5rWcS6SeRwo2qGCNlhoFR130zO6jfNIhKkqNnKolWpXOQeBErRyZ5Xs/9EkCQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-27T18:30:37.753969Z"},"content_sha256":"2a2ef4179d7b6fac5a64322db764dceaa5251eea1c88cc07c3d87eee7f23a6a6","schema_version":"1.0","event_id":"sha256:2a2ef4179d7b6fac5a64322db764dceaa5251eea1c88cc07c3d87eee7f23a6a6"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2024:4O7HQCVOBKXK7AXX7NVR7JHMEM","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"MME-RealWorld: Could Your Multimodal LLM Challenge High-Resolution Real-World Scenarios that are Difficult for Humans?","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"Even the strongest multimodal LLMs fail to reach 60 percent accuracy on high-resolution real-world tasks","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Chaoyou Fu, Feng Li, Haochen Tian, Huanyu Zhang, Junfei Wu, Kun Wang, Liang Wang, Qingsong Wen, Rong Jin, Shuangqing Zhang, Tieniu Tan, Yi-Fan Zhang, Zhang Zhang","submitted_at":"2024-08-23T17:59:51Z","abstract_excerpt":"Comprehensive evaluation of Multimodal Large Language Models (MLLMs) has recently garnered widespread attention in the research community. However, we observe that existing benchmarks present several common barriers that make it difficult to measure the significant challenges that models face in the real world, including: 1) small data scale leads to a large performance variance; 2) reliance on model-based annotations results in restricted data quality; 3) insufficient task difficulty, especially caused by the limited image resolution. To tackle these issues, we introduce MME-RealWorld. Specif"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"even the most advanced models struggle with our benchmarks, where none of them reach 60% accuracy","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The 13,366 filtered images and 29,429 QA pairs created by 25 annotators and 7 experts truly represent high-resolution real-world scenarios that are extremely challenging even for humans","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"MME-RealWorld is the largest manually annotated high-resolution benchmark for MLLMs, where even the best models achieve less than 60% accuracy on challenging real-world tasks.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Even the strongest multimodal LLMs fail to reach 60 percent accuracy on high-resolution real-world tasks","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"3c47290efa41b95378566949bf5bdfadf1ed82c7fbcd54aca2ab551762d88069"},"source":{"id":"2408.13257","kind":"arxiv","version":3},"verdict":{"id":"1ceb05d4-0bd9-4014-b944-232cb8215e04","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-16T07:55:05.575672Z","strongest_claim":"even the most advanced models struggle with our benchmarks, where none of them reach 60% accuracy","one_line_summary":"MME-RealWorld is the largest manually annotated high-resolution benchmark for MLLMs, where even the best models achieve less than 60% accuracy on challenging real-world tasks.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The 13,366 filtered images and 29,429 QA pairs created by 25 annotators and 7 experts truly represent high-resolution real-world scenarios that are extremely challenging even for humans","pith_extraction_headline":"Even the strongest multimodal LLMs fail to reach 60 percent accuracy on high-resolution real-world tasks"},"references":{"count":102,"sample":[{"doi":"","year":2017,"title":"Ntire 2017 challenge on single image super-resolution: Dataset and study","work_id":"653430b4-6f63-4484-ad5e-cfb4134b51af","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"PaLM 2 Technical Report","work_id":"905ee9a7-ea61-4a94-bd62-2600cbe3e315","ref_index":2,"cited_arxiv_id":"2305.10403","is_internal_anchor":true},{"doi":"","year":2023,"title":"OpenFlamingo: An Open-Source Framework for Training Large Autoregressive Vision-Language Models","work_id":"87bfa84a-e663-4165-806f-93ef439d88d0","ref_index":3,"cited_arxiv_id":"2308.01390","is_internal_anchor":true},{"doi":"","year":2023,"title":"Qwen-VL: A Versatile Vision-Language Model for Understanding, Localization, Text Reading, and Beyond","work_id":"cbc2bb21-b6bb-46c0-80bf-107e195ffe10","ref_index":4,"cited_arxiv_id":"2308.12966","is_internal_anchor":true},{"doi":"","year":2023,"title":"TouchStone: Evaluating vision-language models by language models","work_id":"e5349e70-f6c8-40ba-bb19-91bf273abf23","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":102,"snapshot_sha256":"4510d457b362ad27f40618e45ee15bb8950d5d60f46789b6041f72de17ff187e","internal_anchors":30},"formal_canon":{"evidence_count":1,"snapshot_sha256":"62c5e30e4546d7ea8c77a2b71b5db39b0227f0a3ca8e472161fcc4d1548a84d5"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"1ceb05d4-0bd9-4014-b944-232cb8215e04"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:48Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"40h7y6hXk361lgxo3FOUL1sVsCEjdp1xVTqU8dpAr+k803yBcvJCfDZz8A4pZ2Q4CCDrw4r2U6bSZGTFH40+BA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-27T18:30:37.754572Z"},"content_sha256":"0e782c518289844726cf7fd10b30ab8c07a0bea716999acf358d820e8c901b73","schema_version":"1.0","event_id":"sha256:0e782c518289844726cf7fd10b30ab8c07a0bea716999acf358d820e8c901b73"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/4O7HQCVOBKXK7AXX7NVR7JHMEM/bundle.json","state_url":"https://pith.science/pith/4O7HQCVOBKXK7AXX7NVR7JHMEM/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/4O7HQCVOBKXK7AXX7NVR7JHMEM/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-27T18:30:37Z","links":{"resolver":"https://pith.science/pith/4O7HQCVOBKXK7AXX7NVR7JHMEM","bundle":"https://pith.science/pith/4O7HQCVOBKXK7AXX7NVR7JHMEM/bundle.json","state":"https://pith.science/pith/4O7HQCVOBKXK7AXX7NVR7JHMEM/state.json","well_known_bundle":"https://pith.science/.well-known/pith/4O7HQCVOBKXK7AXX7NVR7JHMEM/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2024:4O7HQCVOBKXK7AXX7NVR7JHMEM","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"5d3bbb8b38c0d16507887f6e562134f837670dcf21be01c1811979ce43518d33","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2024-08-23T17:59:51Z","title_canon_sha256":"b579ae444d9a4bd2c060336112ea901912da858435643d1dae227d8eabb9fa89"},"schema_version":"1.0","source":{"id":"2408.13257","kind":"arxiv","version":3}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2408.13257","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"arxiv_version","alias_value":"2408.13257v3","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2408.13257","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"pith_short_12","alias_value":"4O7HQCVOBKXK","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"4O7HQCVOBKXK7AXX","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"4O7HQCVO","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:0e782c518289844726cf7fd10b30ab8c07a0bea716999acf358d820e8c901b73","target":"graph","created_at":"2026-05-17T23:38:48Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"even the most advanced models struggle with our benchmarks, where none of them reach 60% accuracy"},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"The 13,366 filtered images and 29,429 QA pairs created by 25 annotators and 7 experts truly represent high-resolution real-world scenarios that are extremely challenging even for humans"},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"MME-RealWorld is the largest manually annotated high-resolution benchmark for MLLMs, where even the best models achieve less than 60% accuracy on challenging real-world tasks."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Even the strongest multimodal LLMs fail to reach 60 percent accuracy on high-resolution real-world tasks"}],"snapshot_sha256":"3c47290efa41b95378566949bf5bdfadf1ed82c7fbcd54aca2ab551762d88069"},"formal_canon":{"evidence_count":1,"snapshot_sha256":"62c5e30e4546d7ea8c77a2b71b5db39b0227f0a3ca8e472161fcc4d1548a84d5"},"paper":{"abstract_excerpt":"Comprehensive evaluation of Multimodal Large Language Models (MLLMs) has recently garnered widespread attention in the research community. However, we observe that existing benchmarks present several common barriers that make it difficult to measure the significant challenges that models face in the real world, including: 1) small data scale leads to a large performance variance; 2) reliance on model-based annotations results in restricted data quality; 3) insufficient task difficulty, especially caused by the limited image resolution. To tackle these issues, we introduce MME-RealWorld. Specif","authors_text":"Chaoyou Fu, Feng Li, Haochen Tian, Huanyu Zhang, Junfei Wu, Kun Wang, Liang Wang, Qingsong Wen, Rong Jin, Shuangqing Zhang, Tieniu Tan, Yi-Fan Zhang, Zhang Zhang","cross_cats":[],"headline":"Even the strongest multimodal LLMs fail to reach 60 percent accuracy on high-resolution real-world tasks","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2024-08-23T17:59:51Z","title":"MME-RealWorld: Could Your Multimodal LLM Challenge High-Resolution Real-World Scenarios that are Difficult for Humans?"},"references":{"count":102,"internal_anchors":30,"resolved_work":102,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Ntire 2017 challenge on single image super-resolution: Dataset and study","work_id":"653430b4-6f63-4484-ad5e-cfb4134b51af","year":2017},{"cited_arxiv_id":"2305.10403","doi":"","is_internal_anchor":true,"ref_index":2,"title":"PaLM 2 Technical Report","work_id":"905ee9a7-ea61-4a94-bd62-2600cbe3e315","year":2023},{"cited_arxiv_id":"2308.01390","doi":"","is_internal_anchor":true,"ref_index":3,"title":"OpenFlamingo: An Open-Source Framework for Training Large Autoregressive Vision-Language Models","work_id":"87bfa84a-e663-4165-806f-93ef439d88d0","year":2023},{"cited_arxiv_id":"2308.12966","doi":"","is_internal_anchor":true,"ref_index":4,"title":"Qwen-VL: A Versatile Vision-Language Model for Understanding, Localization, Text Reading, and Beyond","work_id":"cbc2bb21-b6bb-46c0-80bf-107e195ffe10","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"TouchStone: Evaluating vision-language models by language models","work_id":"e5349e70-f6c8-40ba-bb19-91bf273abf23","year":2023}],"snapshot_sha256":"4510d457b362ad27f40618e45ee15bb8950d5d60f46789b6041f72de17ff187e"},"source":{"id":"2408.13257","kind":"arxiv","version":3},"verdict":{"created_at":"2026-05-16T07:55:05.575672Z","id":"1ceb05d4-0bd9-4014-b944-232cb8215e04","model_set":{"reader":"grok-4.3"},"one_line_summary":"MME-RealWorld is the largest manually annotated high-resolution benchmark for MLLMs, where even the best models achieve less than 60% accuracy on challenging real-world tasks.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Even the strongest multimodal LLMs fail to reach 60 percent accuracy on high-resolution real-world tasks","strongest_claim":"even the most advanced models struggle with our benchmarks, where none of them reach 60% accuracy","weakest_assumption":"The 13,366 filtered images and 29,429 QA pairs created by 25 annotators and 7 experts truly represent high-resolution real-world scenarios that are extremely challenging even for humans"}},"verdict_id":"1ceb05d4-0bd9-4014-b944-232cb8215e04"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:2a2ef4179d7b6fac5a64322db764dceaa5251eea1c88cc07c3d87eee7f23a6a6","target":"record","created_at":"2026-05-17T23:38:48Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"5d3bbb8b38c0d16507887f6e562134f837670dcf21be01c1811979ce43518d33","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2024-08-23T17:59:51Z","title_canon_sha256":"b579ae444d9a4bd2c060336112ea901912da858435643d1dae227d8eabb9fa89"},"schema_version":"1.0","source":{"id":"2408.13257","kind":"arxiv","version":3}},"canonical_sha256":"e3be780aae0aaeaf82f7fb6b1fa4ec232acef96c97153915e475c39bf8505b35","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"e3be780aae0aaeaf82f7fb6b1fa4ec232acef96c97153915e475c39bf8505b35","first_computed_at":"2026-05-17T23:38:48.584764Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:48.584764Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"47xa0Nieyv6XBgIH3DWRsIsLgnu6TLkAVI9CSEwBRtYD+5l0D0rjA5htEED7beRG9YCUEb/0MebrQVk9sXZoBg==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:48.585247Z","signed_message":"canonical_sha256_bytes"},"source_id":"2408.13257","source_kind":"arxiv","source_version":3}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:2a2ef4179d7b6fac5a64322db764dceaa5251eea1c88cc07c3d87eee7f23a6a6","sha256:0e782c518289844726cf7fd10b30ab8c07a0bea716999acf358d820e8c901b73"],"state_sha256":"e4909ee009b4c15fa7d9d2511916629cae1db78dfc4862fa4db889d1b3c00b4e"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"xbrX/djE27zwHEUbCiL3t/xy597v4wTYxRR6a98KO6602FxogzUpm9A6WbxR7xvoO/euK7l9mbVAiVzkgN6jBw==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-27T18:30:37.757771Z","bundle_sha256":"d29915717e70147a5dc9e681553dc7130015778f5f8cdf8c471d7a4fa5970dc0"}}