{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2019:H3OHMD7GTFTGANEC2ESBEIMACB","short_pith_number":"pith:H3OHMD7G","schema_version":"1.0","canonical_sha256":"3edc760fe69966603482d124122180104e175a53f20228c085461771ec52c57e","source":{"kind":"arxiv","id":"1902.09506","version":3},"attestation_state":"computed","paper":{"title":"GQA: A New Dataset for Real-World Visual Reasoning and Compositional Question Answering","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI","cs.CV","cs.LG"],"primary_cat":"cs.CL","authors_text":"Christopher D. Manning, Drew A. Hudson","submitted_at":"2019-02-25T18:37:49Z","abstract_excerpt":"We introduce GQA, a new dataset for real-world visual reasoning and compositional question answering, seeking to address key shortcomings of previous VQA datasets. We have developed a strong and robust question engine that leverages scene graph structures to create 22M diverse reasoning questions, all come with functional programs that represent their semantics. We use the programs to gain tight control over the answer distribution and present a new tunable smoothing technique to mitigate question biases. Accompanying the dataset is a suite of new metrics that evaluate essential qualities such"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1902.09506","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2019-02-25T18:37:49Z","cross_cats_sorted":["cs.AI","cs.CV","cs.LG"],"title_canon_sha256":"5ce51df1ed24bb5a4b744742a5c9b2260cb6e057e08339c98a4fd65e3ac60cab","abstract_canon_sha256":"087baf2e8e49b4afaad86d7ad416454d11967481e008c18a9ab3f70f3b3d972f"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:40:55.386848Z","signature_b64":"MBZd2EeTODzuXQ1JSP3b+3hXwwVdPFnm6IUgFqXHK2zBX+jtXtB4apa2Bv+ZDlabqu2gotK3jhH5103/wZeWCQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"3edc760fe69966603482d124122180104e175a53f20228c085461771ec52c57e","last_reissued_at":"2026-05-17T23:40:55.386119Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:40:55.386119Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"GQA: A New Dataset for Real-World Visual Reasoning and Compositional Question Answering","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI","cs.CV","cs.LG"],"primary_cat":"cs.CL","authors_text":"Christopher D. Manning, Drew A. Hudson","submitted_at":"2019-02-25T18:37:49Z","abstract_excerpt":"We introduce GQA, a new dataset for real-world visual reasoning and compositional question answering, seeking to address key shortcomings of previous VQA datasets. We have developed a strong and robust question engine that leverages scene graph structures to create 22M diverse reasoning questions, all come with functional programs that represent their semantics. We use the programs to gain tight control over the answer distribution and present a new tunable smoothing technique to mitigate question biases. Accompanying the dataset is a suite of new metrics that evaluate essential qualities such"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1902.09506","kind":"arxiv","version":3},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1902.09506","created_at":"2026-05-17T23:40:55.386237+00:00"},{"alias_kind":"arxiv_version","alias_value":"1902.09506v3","created_at":"2026-05-17T23:40:55.386237+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1902.09506","created_at":"2026-05-17T23:40:55.386237+00:00"},{"alias_kind":"pith_short_12","alias_value":"H3OHMD7GTFTG","created_at":"2026-05-18T12:33:18.533446+00:00"},{"alias_kind":"pith_short_16","alias_value":"H3OHMD7GTFTGANEC","created_at":"2026-05-18T12:33:18.533446+00:00"},{"alias_kind":"pith_short_8","alias_value":"H3OHMD7G","created_at":"2026-05-18T12:33:18.533446+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":12,"internal_anchor_count":7,"sample":[{"citing_arxiv_id":"2605.23883","citing_title":"PGT: Procedurally Generated Tasks for improving visual grounding in MLLMs","ref_index":56,"is_internal_anchor":true},{"citing_arxiv_id":"2406.11354","citing_title":"Preserving Knowledge in Large Language Model with Model-Agnostic Self-Decompression","ref_index":23,"is_internal_anchor":true},{"citing_arxiv_id":"2408.10872","citing_title":"V-RoAst: Visual Road Assessment. Can VLM be a Road Safety Assessor Using the iRAP Standard?","ref_index":17,"is_internal_anchor":true},{"citing_arxiv_id":"2508.06038","citing_title":"Fourier Compressor: Frequency-Domain Visual Token Compression for Vision-Language Models","ref_index":9,"is_internal_anchor":true},{"citing_arxiv_id":"2605.17187","citing_title":"PluRule: A Benchmark for Moderating Pluralistic Communities on Social Media","ref_index":201,"is_internal_anchor":true},{"citing_arxiv_id":"2605.20177","citing_title":"From Seeing to Thinking: Decoupling Perception and Reasoning Improves Post-Training of Vision-Language Models","ref_index":10,"is_internal_anchor":true},{"citing_arxiv_id":"2303.08128","citing_title":"ViperGPT: Visual Inference via Python Execution for Reasoning","ref_index":26,"is_internal_anchor":true},{"citing_arxiv_id":"2604.23788","citing_title":"MIRAGE: A Micro-Interaction Relational Architecture for Grounded Exploration in Multi-Figure Artworks","ref_index":15,"is_internal_anchor":false},{"citing_arxiv_id":"2407.07726","citing_title":"PaliGemma: A versatile 3B VLM for transfer","ref_index":54,"is_internal_anchor":false},{"citing_arxiv_id":"2604.16930","citing_title":"CoGR-MoE: Concept-Guided Expert Routing with Consistent Selection and Flexible Reasoning for Visual Question Answering","ref_index":14,"is_internal_anchor":false},{"citing_arxiv_id":"2604.17488","citing_title":"AutoVQA-G: Self-Improving Agentic Framework for Automated Visual Question Answering and Grounding Annotation","ref_index":13,"is_internal_anchor":false},{"citing_arxiv_id":"2604.23099","citing_title":"ProEval: Proactive Failure Discovery and Efficient Performance Estimation for Generative AI Evaluation","ref_index":29,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/H3OHMD7GTFTGANEC2ESBEIMACB","json":"https://pith.science/pith/H3OHMD7GTFTGANEC2ESBEIMACB.json","graph_json":"https://pith.science/api/pith-number/H3OHMD7GTFTGANEC2ESBEIMACB/graph.json","events_json":"https://pith.science/api/pith-number/H3OHMD7GTFTGANEC2ESBEIMACB/events.json","paper":"https://pith.science/paper/H3OHMD7G"},"agent_actions":{"view_html":"https://pith.science/pith/H3OHMD7GTFTGANEC2ESBEIMACB","download_json":"https://pith.science/pith/H3OHMD7GTFTGANEC2ESBEIMACB.json","view_paper":"https://pith.science/paper/H3OHMD7G","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1902.09506&json=true","fetch_graph":"https://pith.science/api/pith-number/H3OHMD7GTFTGANEC2ESBEIMACB/graph.json","fetch_events":"https://pith.science/api/pith-number/H3OHMD7GTFTGANEC2ESBEIMACB/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/H3OHMD7GTFTGANEC2ESBEIMACB/action/timestamp_anchor","attest_storage":"https://pith.science/pith/H3OHMD7GTFTGANEC2ESBEIMACB/action/storage_attestation","attest_author":"https://pith.science/pith/H3OHMD7GTFTGANEC2ESBEIMACB/action/author_attestation","sign_citation":"https://pith.science/pith/H3OHMD7GTFTGANEC2ESBEIMACB/action/citation_signature","submit_replication":"https://pith.science/pith/H3OHMD7GTFTGANEC2ESBEIMACB/action/replication_record"}},"created_at":"2026-05-17T23:40:55.386237+00:00","updated_at":"2026-05-17T23:40:55.386237+00:00"}