{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2019:D6NE7KCKVEGFUPOX3JUSSHZY7F","short_pith_number":"pith:D6NE7KCK","schema_version":"1.0","canonical_sha256":"1f9a4fa84aa90c5a3dd7da69291f38f952075c082d26763b62d35eda51170f3f","source":{"kind":"arxiv","id":"1907.09340","version":1},"attestation_state":"computed","paper":{"title":"VIFIDEL: Evaluating the Visual Fidelity of Image Descriptions","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.CV","cs.LG"],"primary_cat":"cs.CL","authors_text":"Josiah Wang, Lucia Specia, Pranava Madhyastha","submitted_at":"2019-07-22T14:33:43Z","abstract_excerpt":"We address the task of evaluating image description generation systems. We propose a novel image-aware metric for this task: VIFIDEL. It estimates the faithfulness of a generated caption with respect to the content of the actual image, based on the semantic similarity between labels of objects depicted in images and words in the description. The metric is also able to take into account the relative importance of objects mentioned in human reference descriptions during evaluation. Even if these human reference descriptions are not available, VIFIDEL can still reliably evaluate system descriptio"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1907.09340","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2019-07-22T14:33:43Z","cross_cats_sorted":["cs.CV","cs.LG"],"title_canon_sha256":"cb3eba8d08dafed54883b2888268a81ac4e45c3de02c6c1ded10e9c7b1741e3f","abstract_canon_sha256":"6da79b001aa5ba309c04f63790394f1fe4e1f5ad93c88e472889064e0b3f7ff1"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:39:59.443483Z","signature_b64":"GAeJiEX8CXzaGo0dcL7Ba0yZBviKHAmVu9ULUJJwqrBlCtE66ONwN75vDrDC9ZaLr8wi1vdkoFYft5qD/ZWDDA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"1f9a4fa84aa90c5a3dd7da69291f38f952075c082d26763b62d35eda51170f3f","last_reissued_at":"2026-05-17T23:39:59.443092Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:39:59.443092Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"VIFIDEL: Evaluating the Visual Fidelity of Image Descriptions","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.CV","cs.LG"],"primary_cat":"cs.CL","authors_text":"Josiah Wang, Lucia Specia, Pranava Madhyastha","submitted_at":"2019-07-22T14:33:43Z","abstract_excerpt":"We address the task of evaluating image description generation systems. We propose a novel image-aware metric for this task: VIFIDEL. It estimates the faithfulness of a generated caption with respect to the content of the actual image, based on the semantic similarity between labels of objects depicted in images and words in the description. The metric is also able to take into account the relative importance of objects mentioned in human reference descriptions during evaluation. Even if these human reference descriptions are not available, VIFIDEL can still reliably evaluate system descriptio"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1907.09340","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1907.09340","created_at":"2026-05-17T23:39:59.443144+00:00"},{"alias_kind":"arxiv_version","alias_value":"1907.09340v1","created_at":"2026-05-17T23:39:59.443144+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1907.09340","created_at":"2026-05-17T23:39:59.443144+00:00"},{"alias_kind":"pith_short_12","alias_value":"D6NE7KCKVEGF","created_at":"2026-05-18T12:33:15.570797+00:00"},{"alias_kind":"pith_short_16","alias_value":"D6NE7KCKVEGFUPOX","created_at":"2026-05-18T12:33:15.570797+00:00"},{"alias_kind":"pith_short_8","alias_value":"D6NE7KCK","created_at":"2026-05-18T12:33:15.570797+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":2,"internal_anchor_count":2,"sample":[{"citing_arxiv_id":"2509.16538","citing_title":"VC-Inspector: Advancing Reference-free Evaluation of Video Captions with Factual Analysis","ref_index":24,"is_internal_anchor":true},{"citing_arxiv_id":"2302.12192","citing_title":"Aligning Text-to-Image Models using Human Feedback","ref_index":12,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/D6NE7KCKVEGFUPOX3JUSSHZY7F","json":"https://pith.science/pith/D6NE7KCKVEGFUPOX3JUSSHZY7F.json","graph_json":"https://pith.science/api/pith-number/D6NE7KCKVEGFUPOX3JUSSHZY7F/graph.json","events_json":"https://pith.science/api/pith-number/D6NE7KCKVEGFUPOX3JUSSHZY7F/events.json","paper":"https://pith.science/paper/D6NE7KCK"},"agent_actions":{"view_html":"https://pith.science/pith/D6NE7KCKVEGFUPOX3JUSSHZY7F","download_json":"https://pith.science/pith/D6NE7KCKVEGFUPOX3JUSSHZY7F.json","view_paper":"https://pith.science/paper/D6NE7KCK","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1907.09340&json=true","fetch_graph":"https://pith.science/api/pith-number/D6NE7KCKVEGFUPOX3JUSSHZY7F/graph.json","fetch_events":"https://pith.science/api/pith-number/D6NE7KCKVEGFUPOX3JUSSHZY7F/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/D6NE7KCKVEGFUPOX3JUSSHZY7F/action/timestamp_anchor","attest_storage":"https://pith.science/pith/D6NE7KCKVEGFUPOX3JUSSHZY7F/action/storage_attestation","attest_author":"https://pith.science/pith/D6NE7KCKVEGFUPOX3JUSSHZY7F/action/author_attestation","sign_citation":"https://pith.science/pith/D6NE7KCKVEGFUPOX3JUSSHZY7F/action/citation_signature","submit_replication":"https://pith.science/pith/D6NE7KCKVEGFUPOX3JUSSHZY7F/action/replication_record"}},"created_at":"2026-05-17T23:39:59.443144+00:00","updated_at":"2026-05-17T23:39:59.443144+00:00"}