{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2023:7YTNYMNUELMI4SR5F3BAH7D5KB","short_pith_number":"pith:7YTNYMNU","schema_version":"1.0","canonical_sha256":"fe26dc31b422d88e4a3d2ec203fc7d5061286d2fad3a439a3f0fa3536dda8d6a","source":{"kind":"arxiv","id":"2305.07895","version":7},"attestation_state":"computed","paper":{"title":"OCRBench: On the Hidden Mystery of OCR in Large Multimodal Models","license":"http://creativecommons.org/licenses/by-sa/4.0/","headline":"OCRBench evaluates large multimodal models on 29 OCR datasets to expose their specific weaknesses in text recognition tasks.","cross_cats":["cs.CL"],"primary_cat":"cs.CV","authors_text":"Biao Yang, Cheng-Lin Liu, Chunyuan Li, Lianwen Jin, Mingxin Huang, Wenwen Yu, Xiang Bai, Xucheng Yin, Yuliang Liu, Zhang Li","submitted_at":"2023-05-13T11:28:37Z","abstract_excerpt":"Large models have recently played a dominant role in natural language processing and multimodal vision-language learning. However, their effectiveness in text-related visual tasks remains relatively unexplored. In this paper, we conducted a comprehensive evaluation of Large Multimodal Models, such as GPT4V and Gemini, in various text-related visual tasks including Text Recognition, Scene Text-Centric Visual Question Answering (VQA), Document-Oriented VQA, Key Information Extraction (KIE), and Handwritten Mathematical Expression Recognition (HMER). To facilitate the assessment of Optical Charac"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":true,"formal_links_present":true},"canonical_record":{"source":{"id":"2305.07895","kind":"arxiv","version":7},"metadata":{"license":"http://creativecommons.org/licenses/by-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2023-05-13T11:28:37Z","cross_cats_sorted":["cs.CL"],"title_canon_sha256":"752f574bb06201a05cecfb174a1559187447237c80e2df7daf18a0cc7aad28f9","abstract_canon_sha256":"29d2b49f308511804352e18309295d4a3fa4576924d2b23a7b2a012b54df781f"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:14.446549Z","signature_b64":"vSylaYAQFEq3RFAqshJelFdkQWw4oAKAAwAC8NQoSXRBcNnoWQoWzKlnDd6f0NquvBcj2WwLt85Y+diPx798DA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"fe26dc31b422d88e4a3d2ec203fc7d5061286d2fad3a439a3f0fa3536dda8d6a","last_reissued_at":"2026-05-17T23:38:14.445847Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:14.445847Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"OCRBench: On the Hidden Mystery of OCR in Large Multimodal Models","license":"http://creativecommons.org/licenses/by-sa/4.0/","headline":"OCRBench evaluates large multimodal models on 29 OCR datasets to expose their specific weaknesses in text recognition tasks.","cross_cats":["cs.CL"],"primary_cat":"cs.CV","authors_text":"Biao Yang, Cheng-Lin Liu, Chunyuan Li, Lianwen Jin, Mingxin Huang, Wenwen Yu, Xiang Bai, Xucheng Yin, Yuliang Liu, Zhang Li","submitted_at":"2023-05-13T11:28:37Z","abstract_excerpt":"Large models have recently played a dominant role in natural language processing and multimodal vision-language learning. However, their effectiveness in text-related visual tasks remains relatively unexplored. In this paper, we conducted a comprehensive evaluation of Large Multimodal Models, such as GPT4V and Gemini, in various text-related visual tasks including Text Recognition, Scene Text-Centric Visual Question Answering (VQA), Document-Oriented VQA, Key Information Extraction (KIE), and Handwritten Mathematical Expression Recognition (HMER). To facilitate the assessment of Optical Charac"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"To facilitate the assessment of Optical Character Recognition (OCR) capabilities in Large Multimodal Models, we propose OCRBench, a comprehensive evaluation benchmark. OCRBench contains 29 datasets, making it the most comprehensive OCR evaluation benchmark available.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the 29 chosen datasets together form a representative and non-redundant sample of all text-related visual challenges that large multimodal models will encounter in practice.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"OCRBench provides the largest evaluation suite yet for OCR capabilities in large multimodal models, revealing gaps in multilingual, handwritten, and mathematical text handling.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"OCRBench evaluates large multimodal models on 29 OCR datasets to expose their specific weaknesses in text recognition tasks.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"428e0c0f10e965bedfa7a01d3926d3ce84a3be171f2bc2f6ae97c2cd344153c6"},"source":{"id":"2305.07895","kind":"arxiv","version":7},"verdict":{"id":"becea05d-6d4a-4be0-8f09-e1f7e34325c7","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-17T09:49:42.939904Z","strongest_claim":"To facilitate the assessment of Optical Character Recognition (OCR) capabilities in Large Multimodal Models, we propose OCRBench, a comprehensive evaluation benchmark. OCRBench contains 29 datasets, making it the most comprehensive OCR evaluation benchmark available.","one_line_summary":"OCRBench provides the largest evaluation suite yet for OCR capabilities in large multimodal models, revealing gaps in multilingual, handwritten, and mathematical text handling.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the 29 chosen datasets together form a representative and non-redundant sample of all text-related visual challenges that large multimodal models will encounter in practice.","pith_extraction_headline":"OCRBench evaluates large multimodal models on 29 OCR datasets to expose their specific weaknesses in text recognition tasks."},"references":{"count":122,"sample":[{"doi":"","year":2023,"title":"OpenAI. ChatGPT. https://openai.com/blog/chatgpt/, 2023","work_id":"07eb0a06-5091-41c0-b751-d00d8cff832a","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"Gpt-4 technical report","work_id":"388f534c-855a-4366-b933-f07bf3e2db5f","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"LLaMA: Open and Efficient Foundation Language Models","work_id":"c018fc23-6f3f-4035-9d02-28a2173b2b9d","ref_index":3,"cited_arxiv_id":"2302.13971","is_internal_anchor":true},{"doi":"","year":2023,"title":"Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li, Carlos Guestrin, Percy Liang, and Tatsunori B. Hashimoto. Stanford alpaca: An instruction-following llama model. https://github.co","work_id":"bf3517b5-0f2c-4f46-bff1-8d74220ebc3f","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"Vicuna: An open-source chatbot impressing gpt-4 with 90%* chatgpt quality","work_id":"67dc94e1-9c8e-4287-ae6c-979bce9614cf","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":122,"snapshot_sha256":"b48d398871d3ff6d4ec90f8f42409ca8255fc90f9ac0fe3016784e9780d9745e","internal_anchors":12},"formal_canon":{"evidence_count":2,"snapshot_sha256":"63fd31f74d6ea747e55fce5990745bf740d2afbe75834ef3379c0e8dd0753a60"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2305.07895","created_at":"2026-05-17T23:38:14.445961+00:00"},{"alias_kind":"arxiv_version","alias_value":"2305.07895v7","created_at":"2026-05-17T23:38:14.445961+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2305.07895","created_at":"2026-05-17T23:38:14.445961+00:00"},{"alias_kind":"pith_short_12","alias_value":"7YTNYMNUELMI","created_at":"2026-05-18T12:33:33.725879+00:00"},{"alias_kind":"pith_short_16","alias_value":"7YTNYMNUELMI4SR5","created_at":"2026-05-18T12:33:33.725879+00:00"},{"alias_kind":"pith_short_8","alias_value":"7YTNYMNU","created_at":"2026-05-18T12:33:33.725879+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":29,"internal_anchor_count":29,"sample":[{"citing_arxiv_id":"2502.20295","citing_title":"Judge a Book by its Cover: Investigating Multi-Modal LLMs for Multi-Page Handwritten Document Transcription","ref_index":26,"is_internal_anchor":true},{"citing_arxiv_id":"2503.23733","citing_title":"AdaMMS: Model Merging for Heterogeneous Multimodal Large Language Models with Unsupervised Coefficient Optimization","ref_index":15,"is_internal_anchor":true},{"citing_arxiv_id":"2605.12623","citing_title":"DocAtlas: Multilingual Document Understanding Across 80+ Languages","ref_index":31,"is_internal_anchor":true},{"citing_arxiv_id":"2605.15876","citing_title":"Unlocking Dense Metric Depth Estimation in VLMs","ref_index":37,"is_internal_anchor":true},{"citing_arxiv_id":"2605.18852","citing_title":"Robust Checkpoint Selection for Multimodal LLMs via Agentic Evaluation and Stability-Aware Ranking","ref_index":2,"is_internal_anchor":true},{"citing_arxiv_id":"2605.15876","citing_title":"Unlocking Dense Metric Depth Estimation in VLMs","ref_index":35,"is_internal_anchor":true},{"citing_arxiv_id":"2605.17159","citing_title":"MADP: A Multi-Agent Pipeline for Sustainable Document Processing with Human-in-the-Loop","ref_index":21,"is_internal_anchor":true},{"citing_arxiv_id":"2605.18359","citing_title":"RAVE: Re-Allocating Visual Attention in Large Multimodal Models","ref_index":6,"is_internal_anchor":true},{"citing_arxiv_id":"2605.20033","citing_title":"A Nash Equilibrium Framework For Training-Free Multimodal Step Verification","ref_index":22,"is_internal_anchor":true},{"citing_arxiv_id":"2512.20856","citing_title":"NVIDIA Nemotron 3: Efficient and Open Intelligence","ref_index":85,"is_internal_anchor":true},{"citing_arxiv_id":"2501.01957","citing_title":"VITA-1.5: Towards GPT-4o Level Real-Time Vision and Speech Interaction","ref_index":68,"is_internal_anchor":true},{"citing_arxiv_id":"2501.00321","citing_title":"OCRBench v2: An Improved Benchmark for Evaluating Large Multimodal Models on Visual Text Localization and Reasoning","ref_index":14,"is_internal_anchor":true},{"citing_arxiv_id":"2412.14164","citing_title":"MetaMorph: Multimodal Understanding and Generation via Instruction Tuning","ref_index":215,"is_internal_anchor":true},{"citing_arxiv_id":"2406.09411","citing_title":"MuirBench: A Comprehensive Benchmark for Robust Multi-image Understanding","ref_index":38,"is_internal_anchor":true},{"citing_arxiv_id":"2406.16860","citing_title":"Cambrian-1: A Fully Open, Vision-Centric Exploration of Multimodal LLMs","ref_index":86,"is_internal_anchor":true},{"citing_arxiv_id":"2409.18839","citing_title":"MinerU: An Open-Source Solution for Precise Document Content Extraction","ref_index":19,"is_internal_anchor":true},{"citing_arxiv_id":"2404.12390","citing_title":"BLINK: Multimodal Large Language Models Can See but Not Perceive","ref_index":53,"is_internal_anchor":true},{"citing_arxiv_id":"2311.16502","citing_title":"MMMU: A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark for Expert AGI","ref_index":48,"is_internal_anchor":true},{"citing_arxiv_id":"2605.12623","citing_title":"DocAtlas: Multilingual Document Understanding Across 80+ Languages","ref_index":36,"is_internal_anchor":true},{"citing_arxiv_id":"2605.13080","citing_title":"Learning to See What You Need: Gaze Attention for Multimodal Large Language Models","ref_index":69,"is_internal_anchor":true},{"citing_arxiv_id":"2605.11301","citing_title":"LatentRouter: Can We Choose the Right Multimodal Model Before Seeing Its Answer?","ref_index":24,"is_internal_anchor":true},{"citing_arxiv_id":"2404.16821","citing_title":"How Far Are We to GPT-4V? Closing the Gap to Commercial Multimodal Models with Open-Source Suites","ref_index":67,"is_internal_anchor":true},{"citing_arxiv_id":"2409.18869","citing_title":"Emu3: Next-Token Prediction is All You Need","ref_index":59,"is_internal_anchor":true},{"citing_arxiv_id":"2412.10302","citing_title":"DeepSeek-VL2: Mixture-of-Experts Vision-Language Models for Advanced Multimodal Understanding","ref_index":58,"is_internal_anchor":true},{"citing_arxiv_id":"2501.13106","citing_title":"VideoLLaMA 3: Frontier Multimodal Foundation Models for Image and Video Understanding","ref_index":115,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":2,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/7YTNYMNUELMI4SR5F3BAH7D5KB","json":"https://pith.science/pith/7YTNYMNUELMI4SR5F3BAH7D5KB.json","graph_json":"https://pith.science/api/pith-number/7YTNYMNUELMI4SR5F3BAH7D5KB/graph.json","events_json":"https://pith.science/api/pith-number/7YTNYMNUELMI4SR5F3BAH7D5KB/events.json","paper":"https://pith.science/paper/7YTNYMNU"},"agent_actions":{"view_html":"https://pith.science/pith/7YTNYMNUELMI4SR5F3BAH7D5KB","download_json":"https://pith.science/pith/7YTNYMNUELMI4SR5F3BAH7D5KB.json","view_paper":"https://pith.science/paper/7YTNYMNU","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2305.07895&json=true","fetch_graph":"https://pith.science/api/pith-number/7YTNYMNUELMI4SR5F3BAH7D5KB/graph.json","fetch_events":"https://pith.science/api/pith-number/7YTNYMNUELMI4SR5F3BAH7D5KB/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/7YTNYMNUELMI4SR5F3BAH7D5KB/action/timestamp_anchor","attest_storage":"https://pith.science/pith/7YTNYMNUELMI4SR5F3BAH7D5KB/action/storage_attestation","attest_author":"https://pith.science/pith/7YTNYMNUELMI4SR5F3BAH7D5KB/action/author_attestation","sign_citation":"https://pith.science/pith/7YTNYMNUELMI4SR5F3BAH7D5KB/action/citation_signature","submit_replication":"https://pith.science/pith/7YTNYMNUELMI4SR5F3BAH7D5KB/action/replication_record"}},"created_at":"2026-05-17T23:38:14.445961+00:00","updated_at":"2026-05-17T23:38:14.445961+00:00"}