{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2019:DD6HZH6AMPUXV2Z2Y4KMMCRQZO","short_pith_number":"pith:DD6HZH6A","schema_version":"1.0","canonical_sha256":"18fc7c9fc063e97aeb3ac714c60a30cb80fcf5b6c5964547c3351a42ed200af5","source":{"kind":"arxiv","id":"1904.08920","version":2},"attestation_state":"computed","paper":{"title":"Towards VQA Models That Can Read","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.CV","cs.LG"],"primary_cat":"cs.CL","authors_text":"Amanpreet Singh, Devi Parikh, Dhruv Batra, Marcus Rohrbach, Meet Shah, Vivek Natarajan, Xinlei Chen, Yu Jiang","submitted_at":"2019-04-18T17:55:37Z","abstract_excerpt":"Studies have shown that a dominant class of questions asked by visually impaired users on images of their surroundings involves reading text in the image. But today's VQA models can not read! Our paper takes a first step towards addressing this problem. First, we introduce a new \"TextVQA\" dataset to facilitate progress on this important problem. Existing datasets either have a small proportion of questions about text (e.g., the VQA dataset) or are too small (e.g., the VizWiz dataset). TextVQA contains 45,336 questions on 28,408 images that require reasoning about text to answer. Second, we int"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1904.08920","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2019-04-18T17:55:37Z","cross_cats_sorted":["cs.CV","cs.LG"],"title_canon_sha256":"44ce10db791852ad63110af69ac2c851fd7da4782cf6eb2362ce97663462f27d","abstract_canon_sha256":"059346af8afa3cf5e6b3cd1f1d5427e96a8fdaae1c43761409fdf58d44ff2db3"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:46:17.703673Z","signature_b64":"WdvLrLUbo5hZwIxMsgPEn7j33GvpyVW/UNh+3JjYkyxx21aMzjuU6NnfAHycDMpS84srTB0fRF5sqw9EkH6HBA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"18fc7c9fc063e97aeb3ac714c60a30cb80fcf5b6c5964547c3351a42ed200af5","last_reissued_at":"2026-05-17T23:46:17.703024Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:46:17.703024Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Towards VQA Models That Can Read","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.CV","cs.LG"],"primary_cat":"cs.CL","authors_text":"Amanpreet Singh, Devi Parikh, Dhruv Batra, Marcus Rohrbach, Meet Shah, Vivek Natarajan, Xinlei Chen, Yu Jiang","submitted_at":"2019-04-18T17:55:37Z","abstract_excerpt":"Studies have shown that a dominant class of questions asked by visually impaired users on images of their surroundings involves reading text in the image. But today's VQA models can not read! Our paper takes a first step towards addressing this problem. First, we introduce a new \"TextVQA\" dataset to facilitate progress on this important problem. Existing datasets either have a small proportion of questions about text (e.g., the VQA dataset) or are too small (e.g., the VizWiz dataset). TextVQA contains 45,336 questions on 28,408 images that require reasoning about text to answer. Second, we int"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1904.08920","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1904.08920","created_at":"2026-05-17T23:46:17.703123+00:00"},{"alias_kind":"arxiv_version","alias_value":"1904.08920v2","created_at":"2026-05-17T23:46:17.703123+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1904.08920","created_at":"2026-05-17T23:46:17.703123+00:00"},{"alias_kind":"pith_short_12","alias_value":"DD6HZH6AMPUX","created_at":"2026-05-18T12:33:15.570797+00:00"},{"alias_kind":"pith_short_16","alias_value":"DD6HZH6AMPUXV2Z2","created_at":"2026-05-18T12:33:15.570797+00:00"},{"alias_kind":"pith_short_8","alias_value":"DD6HZH6A","created_at":"2026-05-18T12:33:15.570797+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":4,"internal_anchor_count":3,"sample":[{"citing_arxiv_id":"1907.00490","citing_title":"ICDAR 2019 Competition on Scene Text Visual Question Answering","ref_index":24,"is_internal_anchor":true},{"citing_arxiv_id":"2406.11354","citing_title":"Preserving Knowledge in Large Language Model with Model-Agnostic Self-Decompression","ref_index":51,"is_internal_anchor":true},{"citing_arxiv_id":"2508.06038","citing_title":"Fourier Compressor: Frequency-Domain Visual Token Compression for Vision-Language Models","ref_index":15,"is_internal_anchor":true},{"citing_arxiv_id":"2604.12358","citing_title":"Why and When Visual Token Pruning Fails? A Study on Relevant Visual Information Shift in MLLMs Decoding","ref_index":37,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/DD6HZH6AMPUXV2Z2Y4KMMCRQZO","json":"https://pith.science/pith/DD6HZH6AMPUXV2Z2Y4KMMCRQZO.json","graph_json":"https://pith.science/api/pith-number/DD6HZH6AMPUXV2Z2Y4KMMCRQZO/graph.json","events_json":"https://pith.science/api/pith-number/DD6HZH6AMPUXV2Z2Y4KMMCRQZO/events.json","paper":"https://pith.science/paper/DD6HZH6A"},"agent_actions":{"view_html":"https://pith.science/pith/DD6HZH6AMPUXV2Z2Y4KMMCRQZO","download_json":"https://pith.science/pith/DD6HZH6AMPUXV2Z2Y4KMMCRQZO.json","view_paper":"https://pith.science/paper/DD6HZH6A","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1904.08920&json=true","fetch_graph":"https://pith.science/api/pith-number/DD6HZH6AMPUXV2Z2Y4KMMCRQZO/graph.json","fetch_events":"https://pith.science/api/pith-number/DD6HZH6AMPUXV2Z2Y4KMMCRQZO/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/DD6HZH6AMPUXV2Z2Y4KMMCRQZO/action/timestamp_anchor","attest_storage":"https://pith.science/pith/DD6HZH6AMPUXV2Z2Y4KMMCRQZO/action/storage_attestation","attest_author":"https://pith.science/pith/DD6HZH6AMPUXV2Z2Y4KMMCRQZO/action/author_attestation","sign_citation":"https://pith.science/pith/DD6HZH6AMPUXV2Z2Y4KMMCRQZO/action/citation_signature","submit_replication":"https://pith.science/pith/DD6HZH6AMPUXV2Z2Y4KMMCRQZO/action/replication_record"}},"created_at":"2026-05-17T23:46:17.703123+00:00","updated_at":"2026-05-17T23:46:17.703123+00:00"}