{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2015:LNR5WQRRVUIBFL76SX4GYL6KXB","short_pith_number":"pith:LNR5WQRR","schema_version":"1.0","canonical_sha256":"5b63db4231ad1012affe95f86c2fcab84b6e70001cbd89adee64d7f483abf03c","source":{"kind":"arxiv","id":"1511.03292","version":1},"attestation_state":"computed","paper":{"title":"From Images to Sentences through Scene Description Graphs using Commonsense Reasoning and Knowledge","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","cs.CL"],"primary_cat":"cs.CV","authors_text":"Chitta Baral, Cornelia Fermuller, Somak Aditya, Yezhou Yang, Yiannis Aloimonos","submitted_at":"2015-11-10T21:14:51Z","abstract_excerpt":"In this paper we propose the construction of linguistic descriptions of images. This is achieved through the extraction of scene description graphs (SDGs) from visual scenes using an automatically constructed knowledge base. SDGs are constructed using both vision and reasoning. Specifically, commonsense reasoning is applied on (a) detections obtained from existing perception methods on given images, (b) a \"commonsense\" knowledge base constructed using natural language processing of image annotations and (c) lexical ontological knowledge from resources such as WordNet. Amazon Mechanical Turk(AM"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1511.03292","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2015-11-10T21:14:51Z","cross_cats_sorted":["cs.AI","cs.CL"],"title_canon_sha256":"64bc5d6f46ea656b11e7665b910110aef9fb8ee0a5e22ad13e53b716ec477d89","abstract_canon_sha256":"cd7d2190e2d267bd61afb630fa6f60ec47aa065482d330b9f0528f68b2232e9f"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T01:27:15.838362Z","signature_b64":"obz/WqGNgFcuEbgrqNDbkw17AHtcnuUkbJI21bOo2YWUSRXuKEx2WkvCXkMavaq4AAKUDDf+aCVkM8MNk3/DBg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"5b63db4231ad1012affe95f86c2fcab84b6e70001cbd89adee64d7f483abf03c","last_reissued_at":"2026-05-18T01:27:15.837655Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T01:27:15.837655Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"From Images to Sentences through Scene Description Graphs using Commonsense Reasoning and Knowledge","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","cs.CL"],"primary_cat":"cs.CV","authors_text":"Chitta Baral, Cornelia Fermuller, Somak Aditya, Yezhou Yang, Yiannis Aloimonos","submitted_at":"2015-11-10T21:14:51Z","abstract_excerpt":"In this paper we propose the construction of linguistic descriptions of images. This is achieved through the extraction of scene description graphs (SDGs) from visual scenes using an automatically constructed knowledge base. SDGs are constructed using both vision and reasoning. Specifically, commonsense reasoning is applied on (a) detections obtained from existing perception methods on given images, (b) a \"commonsense\" knowledge base constructed using natural language processing of image annotations and (c) lexical ontological knowledge from resources such as WordNet. Amazon Mechanical Turk(AM"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1511.03292","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1511.03292","created_at":"2026-05-18T01:27:15.837756+00:00"},{"alias_kind":"arxiv_version","alias_value":"1511.03292v1","created_at":"2026-05-18T01:27:15.837756+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1511.03292","created_at":"2026-05-18T01:27:15.837756+00:00"},{"alias_kind":"pith_short_12","alias_value":"LNR5WQRRVUIB","created_at":"2026-05-18T12:29:29.992203+00:00"},{"alias_kind":"pith_short_16","alias_value":"LNR5WQRRVUIBFL76","created_at":"2026-05-18T12:29:29.992203+00:00"},{"alias_kind":"pith_short_8","alias_value":"LNR5WQRR","created_at":"2026-05-18T12:29:29.992203+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":5,"internal_anchor_count":1,"sample":[{"citing_arxiv_id":"2605.21728","citing_title":"BEiTScore: Reference-free Image Captioning Evaluation with an Efficient Cross-Encoder Model","ref_index":1,"is_internal_anchor":true},{"citing_arxiv_id":"2604.03765","citing_title":"ITIScore: An Image-to-Text-to-Image Rating Framework for the Image Captioning Ability of MLLMs","ref_index":1,"is_internal_anchor":false},{"citing_arxiv_id":"2104.08718","citing_title":"CLIPScore: A Reference-free Evaluation Metric for Image Captioning","ref_index":1,"is_internal_anchor":false},{"citing_arxiv_id":"2605.06080","citing_title":"MSD-Score: Multi-Scale Distributional Scoring for Reference-Free Image Caption Evaluation","ref_index":37,"is_internal_anchor":false},{"citing_arxiv_id":"2604.11589","citing_title":"MLLM-as-a-Judge Exhibits Model Preference Bias","ref_index":3,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/LNR5WQRRVUIBFL76SX4GYL6KXB","json":"https://pith.science/pith/LNR5WQRRVUIBFL76SX4GYL6KXB.json","graph_json":"https://pith.science/api/pith-number/LNR5WQRRVUIBFL76SX4GYL6KXB/graph.json","events_json":"https://pith.science/api/pith-number/LNR5WQRRVUIBFL76SX4GYL6KXB/events.json","paper":"https://pith.science/paper/LNR5WQRR"},"agent_actions":{"view_html":"https://pith.science/pith/LNR5WQRRVUIBFL76SX4GYL6KXB","download_json":"https://pith.science/pith/LNR5WQRRVUIBFL76SX4GYL6KXB.json","view_paper":"https://pith.science/paper/LNR5WQRR","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1511.03292&json=true","fetch_graph":"https://pith.science/api/pith-number/LNR5WQRRVUIBFL76SX4GYL6KXB/graph.json","fetch_events":"https://pith.science/api/pith-number/LNR5WQRRVUIBFL76SX4GYL6KXB/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/LNR5WQRRVUIBFL76SX4GYL6KXB/action/timestamp_anchor","attest_storage":"https://pith.science/pith/LNR5WQRRVUIBFL76SX4GYL6KXB/action/storage_attestation","attest_author":"https://pith.science/pith/LNR5WQRRVUIBFL76SX4GYL6KXB/action/author_attestation","sign_citation":"https://pith.science/pith/LNR5WQRRVUIBFL76SX4GYL6KXB/action/citation_signature","submit_replication":"https://pith.science/pith/LNR5WQRRVUIBFL76SX4GYL6KXB/action/replication_record"}},"created_at":"2026-05-18T01:27:15.837756+00:00","updated_at":"2026-05-18T01:27:15.837756+00:00"}