{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2017:QQF26FK5N2TAJIT53SLTTDQCIB","short_pith_number":"pith:QQF26FK5","schema_version":"1.0","canonical_sha256":"840baf155d6ea604a27ddc97398e024053fd35e6dad798bf8ccd4f5268672dc7","source":{"kind":"arxiv","id":"1705.01359","version":1},"attestation_state":"computed","paper":{"title":"FOIL it! Find One mismatch between Image and Language caption","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.CL","cs.MM"],"primary_cat":"cs.CV","authors_text":"Aurelie Herbelot, Enver Sangineto, Moin Nabi, Raffaella Bernardi, Ravi Shekhar, Sandro Pezzelle, Yauhen Klimovich","submitted_at":"2017-05-03T11:07:13Z","abstract_excerpt":"In this paper, we aim to understand whether current language and vision (LaVi) models truly grasp the interaction between the two modalities. To this end, we propose an extension of the MSCOCO dataset, FOIL-COCO, which associates images with both correct and \"foil\" captions, that is, descriptions of the image that are highly similar to the original ones, but contain one single mistake (\"foil word\"). We show that current LaVi models fall into the traps of this data and perform badly on three tasks: a) caption classification (correct vs. foil); b) foil word detection; c) foil word correction. Hu"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1705.01359","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2017-05-03T11:07:13Z","cross_cats_sorted":["cs.CL","cs.MM"],"title_canon_sha256":"3ee0ebfdf5ac20b4017ab6dc3b8d9e1d4bf126c7480720fd866a6f8559985668","abstract_canon_sha256":"fbb30c5213f031a312e3e142217458bfee684c56ef10c08af3d47497deead6d3"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T00:39:06.536347Z","signature_b64":"Tb2KrptJ1HuEacA99/twdBLxCP1TdCV5ZlEfihnjJ0xrtnrlH+p2koibe/6/de+1AsVUyY5hZbA+IrBlWLasDQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"840baf155d6ea604a27ddc97398e024053fd35e6dad798bf8ccd4f5268672dc7","last_reissued_at":"2026-05-18T00:39:06.535729Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T00:39:06.535729Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"FOIL it! Find One mismatch between Image and Language caption","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.CL","cs.MM"],"primary_cat":"cs.CV","authors_text":"Aurelie Herbelot, Enver Sangineto, Moin Nabi, Raffaella Bernardi, Ravi Shekhar, Sandro Pezzelle, Yauhen Klimovich","submitted_at":"2017-05-03T11:07:13Z","abstract_excerpt":"In this paper, we aim to understand whether current language and vision (LaVi) models truly grasp the interaction between the two modalities. To this end, we propose an extension of the MSCOCO dataset, FOIL-COCO, which associates images with both correct and \"foil\" captions, that is, descriptions of the image that are highly similar to the original ones, but contain one single mistake (\"foil word\"). We show that current LaVi models fall into the traps of this data and perform badly on three tasks: a) caption classification (correct vs. foil); b) foil word detection; c) foil word correction. Hu"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1705.01359","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1705.01359","created_at":"2026-05-18T00:39:06.535827+00:00"},{"alias_kind":"arxiv_version","alias_value":"1705.01359v1","created_at":"2026-05-18T00:39:06.535827+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1705.01359","created_at":"2026-05-18T00:39:06.535827+00:00"},{"alias_kind":"pith_short_12","alias_value":"QQF26FK5N2TA","created_at":"2026-05-18T12:31:39.905425+00:00"},{"alias_kind":"pith_short_16","alias_value":"QQF26FK5N2TAJIT5","created_at":"2026-05-18T12:31:39.905425+00:00"},{"alias_kind":"pith_short_8","alias_value":"QQF26FK5","created_at":"2026-05-18T12:31:39.905425+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/QQF26FK5N2TAJIT53SLTTDQCIB","json":"https://pith.science/pith/QQF26FK5N2TAJIT53SLTTDQCIB.json","graph_json":"https://pith.science/api/pith-number/QQF26FK5N2TAJIT53SLTTDQCIB/graph.json","events_json":"https://pith.science/api/pith-number/QQF26FK5N2TAJIT53SLTTDQCIB/events.json","paper":"https://pith.science/paper/QQF26FK5"},"agent_actions":{"view_html":"https://pith.science/pith/QQF26FK5N2TAJIT53SLTTDQCIB","download_json":"https://pith.science/pith/QQF26FK5N2TAJIT53SLTTDQCIB.json","view_paper":"https://pith.science/paper/QQF26FK5","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1705.01359&json=true","fetch_graph":"https://pith.science/api/pith-number/QQF26FK5N2TAJIT53SLTTDQCIB/graph.json","fetch_events":"https://pith.science/api/pith-number/QQF26FK5N2TAJIT53SLTTDQCIB/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/QQF26FK5N2TAJIT53SLTTDQCIB/action/timestamp_anchor","attest_storage":"https://pith.science/pith/QQF26FK5N2TAJIT53SLTTDQCIB/action/storage_attestation","attest_author":"https://pith.science/pith/QQF26FK5N2TAJIT53SLTTDQCIB/action/author_attestation","sign_citation":"https://pith.science/pith/QQF26FK5N2TAJIT53SLTTDQCIB/action/citation_signature","submit_replication":"https://pith.science/pith/QQF26FK5N2TAJIT53SLTTDQCIB/action/replication_record"}},"created_at":"2026-05-18T00:39:06.535827+00:00","updated_at":"2026-05-18T00:39:06.535827+00:00"}