{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:W6SUU52JXO5WYVMSTJ5FXKIXLD","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"19463fcc26754d19928e3026e2c322609af7ebc0165db5ba1af0a1b1300f97eb","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-02-03T12:21:26Z","title_canon_sha256":"d3378c8dcc208a8f25d0727da31c2752b7b1c4dfdb48d5e27fdb2f42bde6e5fd"},"schema_version":"1.0","source":{"id":"2602.03454","kind":"arxiv","version":3}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2602.03454","created_at":"2026-05-20T01:05:07Z"},{"alias_kind":"arxiv_version","alias_value":"2602.03454v3","created_at":"2026-05-20T01:05:07Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2602.03454","created_at":"2026-05-20T01:05:07Z"},{"alias_kind":"pith_short_12","alias_value":"W6SUU52JXO5W","created_at":"2026-05-20T01:05:07Z"},{"alias_kind":"pith_short_16","alias_value":"W6SUU52JXO5WYVMS","created_at":"2026-05-20T01:05:07Z"},{"alias_kind":"pith_short_8","alias_value":"W6SUU52J","created_at":"2026-05-20T01:05:07Z"}],"graph_snapshots":[{"event_id":"sha256:8a13e1778e7cde56e9ae3626a95727313f638b6b9dc06b64b7c91fb77784ec7c","target":"graph","created_at":"2026-05-20T01:05:07Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"CoViP not only improves personalized image captioning but also yields holistic gains across downstream personalization tasks."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the reinforcement-learning post-training and caption-augmented generation cause the model to genuinely use visual context rather than learned textual patterns, and that the introduced diagnostic evaluations fully rule out shortcut solutions."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"CoViP is a unified framework that improves vision-language models' personalized image captioning and downstream tasks through RL-based post-training while introducing diagnostics to confirm visual context usage."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"CoViP trains vision-language models to link new images to a user's past visual experiences for personalized responses."}],"snapshot_sha256":"a4348d3b834f3fef3ef0de36db6bc79147d0a2e6e7526f040c8da04b74e2f841"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"96410463a18b30a7f96e6177b944b5b5c467b0e1a340b2dc9228a5be3e89be3a"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2602.03454/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Despite recent progress in vision-language models (VLMs), existing approaches often fail to generate personalized responses based on the user's specific experiences, as they lack the ability to associate visual inputs with a user's accumulated visual-textual context. We newly formalize this challenge as contextualized visual personalization, which requires the visual recognition and textual retrieval of personalized visual experiences by VLMs when interpreting new images. To address this issue, we propose CoViP, a unified framework that treats personalized image captioning as a core task for c","authors_text":"Han Cheol Moon, Jisoo Mok, Junsung Park, Sangwon Yu, Sungroh Yoon, Yeongtak Oh","cross_cats":[],"headline":"CoViP trains vision-language models to link new images to a user's past visual experiences for personalized responses.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-02-03T12:21:26Z","title":"Contextualized Visual Personalization in Vision-Language Models"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2602.03454","kind":"arxiv","version":3},"verdict":{"created_at":"2026-05-16T08:18:23.596792Z","id":"bf892b24-60b8-47ae-b2e3-e6d62f43c515","model_set":{"reader":"grok-4.3"},"one_line_summary":"CoViP is a unified framework that improves vision-language models' personalized image captioning and downstream tasks through RL-based post-training while introducing diagnostics to confirm visual context usage.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"CoViP trains vision-language models to link new images to a user's past visual experiences for personalized responses.","strongest_claim":"CoViP not only improves personalized image captioning but also yields holistic gains across downstream personalization tasks.","weakest_assumption":"That the reinforcement-learning post-training and caption-augmented generation cause the model to genuinely use visual context rather than learned textual patterns, and that the introduced diagnostic evaluations fully rule out shortcut solutions."}},"verdict_id":"bf892b24-60b8-47ae-b2e3-e6d62f43c515"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:16802b568ecfa5a2c2706009c7ba71fc56c1242f9d926237152003e5b9e2614b","target":"record","created_at":"2026-05-20T01:05:07Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"19463fcc26754d19928e3026e2c322609af7ebc0165db5ba1af0a1b1300f97eb","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-02-03T12:21:26Z","title_canon_sha256":"d3378c8dcc208a8f25d0727da31c2752b7b1c4dfdb48d5e27fdb2f42bde6e5fd"},"schema_version":"1.0","source":{"id":"2602.03454","kind":"arxiv","version":3}},"canonical_sha256":"b7a54a7749bbbb6c55929a7a5ba91758d49647ed06f7f643c34811a8dace9833","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"b7a54a7749bbbb6c55929a7a5ba91758d49647ed06f7f643c34811a8dace9833","first_computed_at":"2026-05-20T01:05:07.777211Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-20T01:05:07.777211Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"VD2fUXNvxZdiVeQSPAeVhfsP8nNT0/IjXRAwHwpKCuszdLtn6GeRtq4QqOlmIWX+b+dGFLW9Bdl8cND6dzqqDA==","signature_status":"signed_v1","signed_at":"2026-05-20T01:05:07.778267Z","signed_message":"canonical_sha256_bytes"},"source_id":"2602.03454","source_kind":"arxiv","source_version":3}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:16802b568ecfa5a2c2706009c7ba71fc56c1242f9d926237152003e5b9e2614b","sha256:8a13e1778e7cde56e9ae3626a95727313f638b6b9dc06b64b7c91fb77784ec7c"],"state_sha256":"1e7c2baf3ba0c8771a11fa9c0a3478276f09254b36b6d9f9abf25590a80da65c"}