{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2016:F36ORSKRDJPGWBVUOLLAG42ZKI","short_pith_number":"pith:F36ORSKR","schema_version":"1.0","canonical_sha256":"2efce8c9511a5e6b06b472d60373595213208ae8e0b0fe25de35c1f0df815c9c","source":{"kind":"arxiv","id":"1601.07140","version":2},"attestation_state":"computed","paper":{"title":"COCO-Text: Dataset and Benchmark for Text Detection and Recognition in Natural Images","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Andreas Veit, Jiri Matas, Lukas Neumann, Serge Belongie, Tomas Matera","submitted_at":"2016-01-26T19:30:34Z","abstract_excerpt":"This paper describes the COCO-Text dataset. In recent years large-scale datasets like SUN and Imagenet drove the advancement of scene understanding and object recognition. The goal of COCO-Text is to advance state-of-the-art in text detection and recognition in natural images. The dataset is based on the MS COCO dataset, which contains images of complex everyday scenes. The images were not collected with text in mind and thus contain a broad variety of text instances. To reflect the diversity of text in natural scenes, we annotate text with (a) location in terms of a bounding box, (b) fine-gra"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1601.07140","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2016-01-26T19:30:34Z","cross_cats_sorted":[],"title_canon_sha256":"e7e70326acd08aeb3e803e2fe3a104625261658ac5c04166ef7f6f529bc16678","abstract_canon_sha256":"6b8c9b91f8af49a1822d53d58129f96f5d57465f9635f238805f79dcff317189"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T01:12:16.870762Z","signature_b64":"OTr3Xpaye7GxJ1lfUNdtJcSKp4/4ZBDefA9EOHA2H6aiem1uADXzvG60uOv82ntLzIBr7kMIparPAXK0Dq4sAA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"2efce8c9511a5e6b06b472d60373595213208ae8e0b0fe25de35c1f0df815c9c","last_reissued_at":"2026-05-18T01:12:16.870430Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T01:12:16.870430Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"COCO-Text: Dataset and Benchmark for Text Detection and Recognition in Natural Images","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Andreas Veit, Jiri Matas, Lukas Neumann, Serge Belongie, Tomas Matera","submitted_at":"2016-01-26T19:30:34Z","abstract_excerpt":"This paper describes the COCO-Text dataset. In recent years large-scale datasets like SUN and Imagenet drove the advancement of scene understanding and object recognition. The goal of COCO-Text is to advance state-of-the-art in text detection and recognition in natural images. The dataset is based on the MS COCO dataset, which contains images of complex everyday scenes. The images were not collected with text in mind and thus contain a broad variety of text instances. To reflect the diversity of text in natural scenes, we annotate text with (a) location in terms of a bounding box, (b) fine-gra"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1601.07140","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1601.07140","created_at":"2026-05-18T01:12:16.870485+00:00"},{"alias_kind":"arxiv_version","alias_value":"1601.07140v2","created_at":"2026-05-18T01:12:16.870485+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1601.07140","created_at":"2026-05-18T01:12:16.870485+00:00"},{"alias_kind":"pith_short_12","alias_value":"F36ORSKRDJPG","created_at":"2026-05-18T12:30:15.759754+00:00"},{"alias_kind":"pith_short_16","alias_value":"F36ORSKRDJPGWBVU","created_at":"2026-05-18T12:30:15.759754+00:00"},{"alias_kind":"pith_short_8","alias_value":"F36ORSKR","created_at":"2026-05-18T12:30:15.759754+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":16,"internal_anchor_count":11,"sample":[{"citing_arxiv_id":"1907.00490","citing_title":"ICDAR 2019 Competition on Scene Text Visual Question Answering","ref_index":25,"is_internal_anchor":true},{"citing_arxiv_id":"1907.00945","citing_title":"ICDAR2019 Robust Reading Challenge on Multi-lingual Scene Text Detection and Recognition -- RRC-MLT-2019","ref_index":2,"is_internal_anchor":true},{"citing_arxiv_id":"1907.06119","citing_title":"Understanding Deep Learning Techniques for Image Segmentation","ref_index":203,"is_internal_anchor":true},{"citing_arxiv_id":"2410.21169","citing_title":"Document Parsing Unveiled: Techniques, Challenges, and Prospects for Structured Information Extraction","ref_index":233,"is_internal_anchor":true},{"citing_arxiv_id":"2605.17309","citing_title":"StyleText: A Large-Scale Dataset and Benchmark for Stylized Scene Text Inpainting","ref_index":23,"is_internal_anchor":true},{"citing_arxiv_id":"2506.00721","citing_title":"Common Inpainted Objects In-N-Out of Context","ref_index":12,"is_internal_anchor":true},{"citing_arxiv_id":"2506.09082","citing_title":"AVA-Bench: Atomic Visual Ability Benchmark for Vision Foundation Models","ref_index":93,"is_internal_anchor":true},{"citing_arxiv_id":"2512.20856","citing_title":"NVIDIA Nemotron 3: Efficient and Open Intelligence","ref_index":76,"is_internal_anchor":true},{"citing_arxiv_id":"2409.01704","citing_title":"General OCR Theory: Towards OCR-2.0 via a Unified End-to-end Model","ref_index":44,"is_internal_anchor":true},{"citing_arxiv_id":"2501.00321","citing_title":"OCRBench v2: An Improved Benchmark for Evaluating Large Multimodal Models on Visual Text Localization and Reasoning","ref_index":82,"is_internal_anchor":true},{"citing_arxiv_id":"2305.07895","citing_title":"OCRBench: On the Hidden Mystery of OCR in Large Multimodal Models","ref_index":43,"is_internal_anchor":true},{"citing_arxiv_id":"2604.00161","citing_title":"Q-Mask: Query-driven Causal Masks for Text Anchoring in OCR-Oriented Vision-Language Models","ref_index":33,"is_internal_anchor":false},{"citing_arxiv_id":"2404.16821","citing_title":"How Far Are We to GPT-4V? Closing the Gap to Commercial Multimodal Models with Open-Source Suites","ref_index":114,"is_internal_anchor":false},{"citing_arxiv_id":"2501.13106","citing_title":"VideoLLaMA 3: Frontier Multimodal Foundation Models for Image and Video Understanding","ref_index":67,"is_internal_anchor":false},{"citing_arxiv_id":"2412.05271","citing_title":"Expanding Performance Boundaries of Open-Source Multimodal Models with Model, Data, and Test-Time Scaling","ref_index":239,"is_internal_anchor":false},{"citing_arxiv_id":"2604.17941","citing_title":"From Heads to Neurons: Causal Attribution and Steering in Multi-Task Vision-Language Models","ref_index":102,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/F36ORSKRDJPGWBVUOLLAG42ZKI","json":"https://pith.science/pith/F36ORSKRDJPGWBVUOLLAG42ZKI.json","graph_json":"https://pith.science/api/pith-number/F36ORSKRDJPGWBVUOLLAG42ZKI/graph.json","events_json":"https://pith.science/api/pith-number/F36ORSKRDJPGWBVUOLLAG42ZKI/events.json","paper":"https://pith.science/paper/F36ORSKR"},"agent_actions":{"view_html":"https://pith.science/pith/F36ORSKRDJPGWBVUOLLAG42ZKI","download_json":"https://pith.science/pith/F36ORSKRDJPGWBVUOLLAG42ZKI.json","view_paper":"https://pith.science/paper/F36ORSKR","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1601.07140&json=true","fetch_graph":"https://pith.science/api/pith-number/F36ORSKRDJPGWBVUOLLAG42ZKI/graph.json","fetch_events":"https://pith.science/api/pith-number/F36ORSKRDJPGWBVUOLLAG42ZKI/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/F36ORSKRDJPGWBVUOLLAG42ZKI/action/timestamp_anchor","attest_storage":"https://pith.science/pith/F36ORSKRDJPGWBVUOLLAG42ZKI/action/storage_attestation","attest_author":"https://pith.science/pith/F36ORSKRDJPGWBVUOLLAG42ZKI/action/author_attestation","sign_citation":"https://pith.science/pith/F36ORSKRDJPGWBVUOLLAG42ZKI/action/citation_signature","submit_replication":"https://pith.science/pith/F36ORSKRDJPGWBVUOLLAG42ZKI/action/replication_record"}},"created_at":"2026-05-18T01:12:16.870485+00:00","updated_at":"2026-05-18T01:12:16.870485+00:00"}