{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2019:6Q2D2HIVJG3LANUYNCGCEOUXR2","short_pith_number":"pith:6Q2D2HIV","schema_version":"1.0","canonical_sha256":"f4343d1d1549b6b03698688c223a978eb2c929889877af15ff9babde09005951","source":{"kind":"arxiv","id":"1906.07689","version":2},"attestation_state":"computed","paper":{"title":"Expressing Visual Relationships via Language","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.CV","cs.LG"],"primary_cat":"cs.CL","authors_text":"Franck Dernoncourt, Hao Tan, Mohit Bansal, Trung Bui, Zhe Lin","submitted_at":"2019-06-18T17:01:21Z","abstract_excerpt":"Describing images with text is a fundamental problem in vision-language research. Current studies in this domain mostly focus on single image captioning. However, in various real applications (e.g., image editing, difference interpretation, and retrieval), generating relational captions for two images, can also be very useful. This important problem has not been explored mostly due to lack of datasets and effective models. To push forward the research in this direction, we first introduce a new language-guided image editing dataset that contains a large number of real image pairs with correspo"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1906.07689","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2019-06-18T17:01:21Z","cross_cats_sorted":["cs.CV","cs.LG"],"title_canon_sha256":"602e8d4669834bf8feb5ef724dfb03c77f498d2844b9f67e65ecaad2d768837e","abstract_canon_sha256":"946b17ce07df9ad09c7df8290a036f8b6c728f54111b19d13d742739f33ad672"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:42:57.931741Z","signature_b64":"EowiXj1zkdbFbx/CpFbWQJQfhtk1Ts00qbRdpDBmY4yCWOjn31/Fql03G0yH0CqHTkfQ1wHVMqiCLwOVU/JRDQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"f4343d1d1549b6b03698688c223a978eb2c929889877af15ff9babde09005951","last_reissued_at":"2026-05-17T23:42:57.931063Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:42:57.931063Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Expressing Visual Relationships via Language","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.CV","cs.LG"],"primary_cat":"cs.CL","authors_text":"Franck Dernoncourt, Hao Tan, Mohit Bansal, Trung Bui, Zhe Lin","submitted_at":"2019-06-18T17:01:21Z","abstract_excerpt":"Describing images with text is a fundamental problem in vision-language research. Current studies in this domain mostly focus on single image captioning. However, in various real applications (e.g., image editing, difference interpretation, and retrieval), generating relational captions for two images, can also be very useful. This important problem has not been explored mostly due to lack of datasets and effective models. To push forward the research in this direction, we first introduce a new language-guided image editing dataset that contains a large number of real image pairs with correspo"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1906.07689","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1906.07689","created_at":"2026-05-17T23:42:57.931173+00:00"},{"alias_kind":"arxiv_version","alias_value":"1906.07689v2","created_at":"2026-05-17T23:42:57.931173+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1906.07689","created_at":"2026-05-17T23:42:57.931173+00:00"},{"alias_kind":"pith_short_12","alias_value":"6Q2D2HIVJG3L","created_at":"2026-05-18T12:33:10.108867+00:00"},{"alias_kind":"pith_short_16","alias_value":"6Q2D2HIVJG3LANUY","created_at":"2026-05-18T12:33:10.108867+00:00"},{"alias_kind":"pith_short_8","alias_value":"6Q2D2HIV","created_at":"2026-05-18T12:33:10.108867+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":1,"internal_anchor_count":1,"sample":[{"citing_arxiv_id":"2505.17726","citing_title":"Slot-MLLM: Object-Centric Visual Tokenization for Multimodal LLM","ref_index":67,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/6Q2D2HIVJG3LANUYNCGCEOUXR2","json":"https://pith.science/pith/6Q2D2HIVJG3LANUYNCGCEOUXR2.json","graph_json":"https://pith.science/api/pith-number/6Q2D2HIVJG3LANUYNCGCEOUXR2/graph.json","events_json":"https://pith.science/api/pith-number/6Q2D2HIVJG3LANUYNCGCEOUXR2/events.json","paper":"https://pith.science/paper/6Q2D2HIV"},"agent_actions":{"view_html":"https://pith.science/pith/6Q2D2HIVJG3LANUYNCGCEOUXR2","download_json":"https://pith.science/pith/6Q2D2HIVJG3LANUYNCGCEOUXR2.json","view_paper":"https://pith.science/paper/6Q2D2HIV","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1906.07689&json=true","fetch_graph":"https://pith.science/api/pith-number/6Q2D2HIVJG3LANUYNCGCEOUXR2/graph.json","fetch_events":"https://pith.science/api/pith-number/6Q2D2HIVJG3LANUYNCGCEOUXR2/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/6Q2D2HIVJG3LANUYNCGCEOUXR2/action/timestamp_anchor","attest_storage":"https://pith.science/pith/6Q2D2HIVJG3LANUYNCGCEOUXR2/action/storage_attestation","attest_author":"https://pith.science/pith/6Q2D2HIVJG3LANUYNCGCEOUXR2/action/author_attestation","sign_citation":"https://pith.science/pith/6Q2D2HIVJG3LANUYNCGCEOUXR2/action/citation_signature","submit_replication":"https://pith.science/pith/6Q2D2HIVJG3LANUYNCGCEOUXR2/action/replication_record"}},"created_at":"2026-05-17T23:42:57.931173+00:00","updated_at":"2026-05-17T23:42:57.931173+00:00"}