{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2024:TMBSFGI7VZX25WWNHMID3LJE2U","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"85033f543c1f74ea344139b6dc5ad95a76f8869005c861a8351a8ccde22cfd92","cross_cats_sorted":["cs.AI","cs.CL","cs.LG"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2024-04-30T01:19:18Z","title_canon_sha256":"23e268013b78d41c3e139223aff78373d89b3bb61f5ec16b8a9084aca12f96dd"},"schema_version":"1.0","source":{"id":"2405.00740","kind":"arxiv","version":4}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2405.00740","created_at":"2026-07-05T10:41:12Z"},{"alias_kind":"arxiv_version","alias_value":"2405.00740v4","created_at":"2026-07-05T10:41:12Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2405.00740","created_at":"2026-07-05T10:41:12Z"},{"alias_kind":"pith_short_12","alias_value":"TMBSFGI7VZX2","created_at":"2026-07-05T10:41:12Z"},{"alias_kind":"pith_short_16","alias_value":"TMBSFGI7VZX25WWN","created_at":"2026-07-05T10:41:12Z"},{"alias_kind":"pith_short_8","alias_value":"TMBSFGI7","created_at":"2026-07-05T10:41:12Z"}],"graph_snapshots":[{"event_id":"sha256:59b1c4ec735e7637364e5390ba108f73f97eb5f21bf6064bd6a843f97af59e0c","target":"graph","created_at":"2026-07-05T10:41:12Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2405.00740/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"There are a thousand ways to caption an image. Contrastive Language Pretraining (CLIP) on the other hand, works by mapping an image and its caption to a single vector -- limiting how well CLIP-like models can represent the diverse ways to describe an image. In this work, we introduce Llip, Latent Language Image Pretraining, which models the diversity of captions that could match an image. Llip's vision encoder outputs a set of visual features that are mixed into a final representation by conditioning on information derived from the text. We show that Llip outperforms non-contextualized baselin","authors_text":"Aaron Courville, Andrew Gordon Wilson, Mahmoud Assran, Mark Ibrahim, Nicolas Ballas, Polina Kirichenko, Samuel Lavoie","cross_cats":["cs.AI","cs.CL","cs.LG"],"headline":"","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2024-04-30T01:19:18Z","title":"Modeling Caption Diversity in Contrastive Vision-Language Pretraining"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2405.00740","kind":"arxiv","version":4},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:b07e45e94953d43075c00936d06becd77f924aa10a0ddf4eae9351034f1182ae","target":"record","created_at":"2026-07-05T10:41:12Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"85033f543c1f74ea344139b6dc5ad95a76f8869005c861a8351a8ccde22cfd92","cross_cats_sorted":["cs.AI","cs.CL","cs.LG"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2024-04-30T01:19:18Z","title_canon_sha256":"23e268013b78d41c3e139223aff78373d89b3bb61f5ec16b8a9084aca12f96dd"},"schema_version":"1.0","source":{"id":"2405.00740","kind":"arxiv","version":4}},"canonical_sha256":"9b0322991fae6faedacd3b103dad24d51ea226817bf7d0bc7c6c52601c05f22c","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"9b0322991fae6faedacd3b103dad24d51ea226817bf7d0bc7c6c52601c05f22c","first_computed_at":"2026-07-05T10:41:12.525232Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-07-05T10:41:12.525232Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"ZQizrL2nBjy+NCkcjI9A3YF0GaQeO+cfy/Eq8cRg413bN6I7daAO477jmTbC9c3nnD8QhWZ+s4SLd6n1sH7ZAg==","signature_status":"signed_v1","signed_at":"2026-07-05T10:41:12.525733Z","signed_message":"canonical_sha256_bytes"},"source_id":"2405.00740","source_kind":"arxiv","source_version":4}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:b07e45e94953d43075c00936d06becd77f924aa10a0ddf4eae9351034f1182ae","sha256:59b1c4ec735e7637364e5390ba108f73f97eb5f21bf6064bd6a843f97af59e0c"],"state_sha256":"2f181e0db0adb1c949cae1e3c6b7d0c5fb484df78ad4ad643bd54ee2263c122d"}