{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2018:YRRXP247IL3VIYX3Y7M7NRTX5X","short_pith_number":"pith:YRRXP247","schema_version":"1.0","canonical_sha256":"c46377eb9f42f75462fbc7d9f6c677ede5f6986f373c157f0fde1946f82da4c7","source":{"kind":"arxiv","id":"1811.00491","version":3},"attestation_state":"computed","paper":{"title":"A Corpus for Reasoning About Natural Language Grounded in Photographs","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.CV"],"primary_cat":"cs.CL","authors_text":"Alane Suhr, Ally Zhang, Huajun Bai, Iris Zhang, Stephanie Zhou, Yoav Artzi","submitted_at":"2018-11-01T16:47:44Z","abstract_excerpt":"We introduce a new dataset for joint reasoning about natural language and images, with a focus on semantic diversity, compositionality, and visual reasoning challenges. The data contains 107,292 examples of English sentences paired with web photographs. The task is to determine whether a natural language caption is true about a pair of photographs. We crowdsource the data using sets of visually rich images and a compare-and-contrast task to elicit linguistically diverse language. Qualitative analysis shows the data requires compositional joint reasoning, including about quantities, comparisons"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1811.00491","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2018-11-01T16:47:44Z","cross_cats_sorted":["cs.CV"],"title_canon_sha256":"5d37392b07467cc28c6690ac1d4eb0803e2c7b3bb60610d4bbe098411858e542","abstract_canon_sha256":"463e51249542a2a7056d4a886d55a62d7130fc395c274abcaaa8cd49f89b9071"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:40:04.139401Z","signature_b64":"OiHUakMSDA4O/c9o6tIJpHc2itH5V2t9n4oAIoYhqVF4uPFoN0JWgXhCT0ygV9YMic3wQ9LJ1QK/jCbv3eRTAg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"c46377eb9f42f75462fbc7d9f6c677ede5f6986f373c157f0fde1946f82da4c7","last_reissued_at":"2026-05-17T23:40:04.138666Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:40:04.138666Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"A Corpus for Reasoning About Natural Language Grounded in Photographs","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.CV"],"primary_cat":"cs.CL","authors_text":"Alane Suhr, Ally Zhang, Huajun Bai, Iris Zhang, Stephanie Zhou, Yoav Artzi","submitted_at":"2018-11-01T16:47:44Z","abstract_excerpt":"We introduce a new dataset for joint reasoning about natural language and images, with a focus on semantic diversity, compositionality, and visual reasoning challenges. The data contains 107,292 examples of English sentences paired with web photographs. The task is to determine whether a natural language caption is true about a pair of photographs. We crowdsource the data using sets of visually rich images and a compare-and-contrast task to elicit linguistically diverse language. Qualitative analysis shows the data requires compositional joint reasoning, including about quantities, comparisons"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1811.00491","kind":"arxiv","version":3},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1811.00491","created_at":"2026-05-17T23:40:04.138763+00:00"},{"alias_kind":"arxiv_version","alias_value":"1811.00491v3","created_at":"2026-05-17T23:40:04.138763+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1811.00491","created_at":"2026-05-17T23:40:04.138763+00:00"},{"alias_kind":"pith_short_12","alias_value":"YRRXP247IL3V","created_at":"2026-05-18T12:33:04.347982+00:00"},{"alias_kind":"pith_short_16","alias_value":"YRRXP247IL3VIYX3","created_at":"2026-05-18T12:33:04.347982+00:00"},{"alias_kind":"pith_short_8","alias_value":"YRRXP247","created_at":"2026-05-18T12:33:04.347982+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":6,"internal_anchor_count":2,"sample":[{"citing_arxiv_id":"2408.04840","citing_title":"mPLUG-Owl3: Towards Long Image-Sequence Understanding in Multi-Modal Large Language Models","ref_index":243,"is_internal_anchor":true},{"citing_arxiv_id":"2205.01917","citing_title":"CoCa: Contrastive Captioners are Image-Text Foundation Models","ref_index":77,"is_internal_anchor":true},{"citing_arxiv_id":"2604.18452","citing_title":"ESsEN: Training Compact Discriminative Vision-Language Transformers in a Low-Resource Setting","ref_index":42,"is_internal_anchor":false},{"citing_arxiv_id":"2407.07895","citing_title":"LLaVA-NeXT-Interleave: Tackling Multi-image, Video, and 3D in Large Multimodal Models","ref_index":50,"is_internal_anchor":false},{"citing_arxiv_id":"2604.05583","citing_title":"WRF4CIR: Weight-Regularized Fine-Tuning Network for Composed Image Retrieval","ref_index":56,"is_internal_anchor":false},{"citing_arxiv_id":"2412.05271","citing_title":"Expanding Performance Boundaries of Open-Source Multimodal Models with Model, Data, and Test-Time Scaling","ref_index":217,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/YRRXP247IL3VIYX3Y7M7NRTX5X","json":"https://pith.science/pith/YRRXP247IL3VIYX3Y7M7NRTX5X.json","graph_json":"https://pith.science/api/pith-number/YRRXP247IL3VIYX3Y7M7NRTX5X/graph.json","events_json":"https://pith.science/api/pith-number/YRRXP247IL3VIYX3Y7M7NRTX5X/events.json","paper":"https://pith.science/paper/YRRXP247"},"agent_actions":{"view_html":"https://pith.science/pith/YRRXP247IL3VIYX3Y7M7NRTX5X","download_json":"https://pith.science/pith/YRRXP247IL3VIYX3Y7M7NRTX5X.json","view_paper":"https://pith.science/paper/YRRXP247","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1811.00491&json=true","fetch_graph":"https://pith.science/api/pith-number/YRRXP247IL3VIYX3Y7M7NRTX5X/graph.json","fetch_events":"https://pith.science/api/pith-number/YRRXP247IL3VIYX3Y7M7NRTX5X/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/YRRXP247IL3VIYX3Y7M7NRTX5X/action/timestamp_anchor","attest_storage":"https://pith.science/pith/YRRXP247IL3VIYX3Y7M7NRTX5X/action/storage_attestation","attest_author":"https://pith.science/pith/YRRXP247IL3VIYX3Y7M7NRTX5X/action/author_attestation","sign_citation":"https://pith.science/pith/YRRXP247IL3VIYX3Y7M7NRTX5X/action/citation_signature","submit_replication":"https://pith.science/pith/YRRXP247IL3VIYX3Y7M7NRTX5X/action/replication_record"}},"created_at":"2026-05-17T23:40:04.138763+00:00","updated_at":"2026-05-17T23:40:04.138763+00:00"}