{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:OXFILZ3XVNGJMGUP77ZFC3TAUJ","short_pith_number":"pith:OXFILZ3X","canonical_record":{"source":{"id":"2602.01851","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2026-02-02T09:24:45Z","cross_cats_sorted":[],"title_canon_sha256":"bc044520ec797e2a23053797128c318ddbce64491b1abbbc15cb080783d63c34","abstract_canon_sha256":"44b30b69f9237bbec3a0530d1e174a09de25282fa30b61140f9039433b4ac0ab"},"schema_version":"1.0"},"canonical_sha256":"75ca85e777ab4c961a8ffff2516e60a27e1c01fed4880f442ff36ceddccf033d","source":{"kind":"arxiv","id":"2602.01851","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2602.01851","created_at":"2026-05-22T01:03:55Z"},{"alias_kind":"arxiv_version","alias_value":"2602.01851v2","created_at":"2026-05-22T01:03:55Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2602.01851","created_at":"2026-05-22T01:03:55Z"},{"alias_kind":"pith_short_12","alias_value":"OXFILZ3XVNGJ","created_at":"2026-05-22T01:03:55Z"},{"alias_kind":"pith_short_16","alias_value":"OXFILZ3XVNGJMGUP","created_at":"2026-05-22T01:03:55Z"},{"alias_kind":"pith_short_8","alias_value":"OXFILZ3X","created_at":"2026-05-22T01:03:55Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:OXFILZ3XVNGJMGUP77ZFC3TAUJ","target":"record","payload":{"canonical_record":{"source":{"id":"2602.01851","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2026-02-02T09:24:45Z","cross_cats_sorted":[],"title_canon_sha256":"bc044520ec797e2a23053797128c318ddbce64491b1abbbc15cb080783d63c34","abstract_canon_sha256":"44b30b69f9237bbec3a0530d1e174a09de25282fa30b61140f9039433b4ac0ab"},"schema_version":"1.0"},"canonical_sha256":"75ca85e777ab4c961a8ffff2516e60a27e1c01fed4880f442ff36ceddccf033d","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-22T01:03:55.827787Z","signature_b64":"RB8XUbJlx7yrRf824A4moV8YSW5gCWWGUjh18k/NLz60MfcZVyW3Wwo0MXlU9iB5wAfnS+mHA8E3Cyjqhsx0BQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"75ca85e777ab4c961a8ffff2516e60a27e1c01fed4880f442ff36ceddccf033d","last_reissued_at":"2026-05-22T01:03:55.826794Z","signature_status":"signed_v1","first_computed_at":"2026-05-22T01:03:55.826794Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2602.01851","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-22T01:03:55Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"PGtxuae1B3zaZ6wfBX0IEjrRDajDKYcEmuWDhY50ifquv0bQfSzjmR2rsrtnrd90/Wt5oMEtEC0359LX7br/Cw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-24T08:35:21.385472Z"},"content_sha256":"bcdb30941a99bbff48a492169a0c2170792b667b565444822d8b4a3ceac1176f","schema_version":"1.0","event_id":"sha256:bcdb30941a99bbff48a492169a0c2170792b667b565444822d8b4a3ceac1176f"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:OXFILZ3XVNGJMGUP77ZFC3TAUJ","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"How Well Do Models Follow Visual Instructions? VIBE: A Systematic Benchmark for Visual Instruction-Driven Image Editing","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Anna Korhonen, Chengzu Li, Chen Liang, Haochen Tian, Haodong Li, Huanyu Zhang, Liang Wang, Ruichuan An, Tieniu Tan, Xuehai Bai, Yifan Zhang, Zhang Zhang","submitted_at":"2026-02-02T09:24:45Z","abstract_excerpt":"Recent generative models have achieved remarkable progress in image editing. However, existing systems and benchmarks remain largely text-guided. In contrast, human communication is inherently multimodal, where visual instructions such as sketches efficiently convey spatial and structural intent. To address this gap, we introduce VIBE, the Visual Instruction Benchmark for Image Editing with a three-level interaction hierarchy that captures deictic grounding, morphological manipulation, and causal reasoning. Across these levels, we curate high-quality and diverse test cases that reflect progres"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2602.01851","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2602.01851/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-22T01:03:55Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"Ugh9dy+4wxp2Rx/PqLq+ma/J3WDogwLCpKkAGlhbgyDDc6IsEenRnSGclWty5kC7fs/y3fQ7uFYZx0Mi69NlCA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-24T08:35:21.386264Z"},"content_sha256":"cc6edabe7d94e2531697896c7fe220fbe823b0ff02e1fb99ab76fbd54f0a693e","schema_version":"1.0","event_id":"sha256:cc6edabe7d94e2531697896c7fe220fbe823b0ff02e1fb99ab76fbd54f0a693e"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/OXFILZ3XVNGJMGUP77ZFC3TAUJ/bundle.json","state_url":"https://pith.science/pith/OXFILZ3XVNGJMGUP77ZFC3TAUJ/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/OXFILZ3XVNGJMGUP77ZFC3TAUJ/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-24T08:35:21Z","links":{"resolver":"https://pith.science/pith/OXFILZ3XVNGJMGUP77ZFC3TAUJ","bundle":"https://pith.science/pith/OXFILZ3XVNGJMGUP77ZFC3TAUJ/bundle.json","state":"https://pith.science/pith/OXFILZ3XVNGJMGUP77ZFC3TAUJ/state.json","well_known_bundle":"https://pith.science/.well-known/pith/OXFILZ3XVNGJMGUP77ZFC3TAUJ/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:OXFILZ3XVNGJMGUP77ZFC3TAUJ","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"44b30b69f9237bbec3a0530d1e174a09de25282fa30b61140f9039433b4ac0ab","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2026-02-02T09:24:45Z","title_canon_sha256":"bc044520ec797e2a23053797128c318ddbce64491b1abbbc15cb080783d63c34"},"schema_version":"1.0","source":{"id":"2602.01851","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2602.01851","created_at":"2026-05-22T01:03:55Z"},{"alias_kind":"arxiv_version","alias_value":"2602.01851v2","created_at":"2026-05-22T01:03:55Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2602.01851","created_at":"2026-05-22T01:03:55Z"},{"alias_kind":"pith_short_12","alias_value":"OXFILZ3XVNGJ","created_at":"2026-05-22T01:03:55Z"},{"alias_kind":"pith_short_16","alias_value":"OXFILZ3XVNGJMGUP","created_at":"2026-05-22T01:03:55Z"},{"alias_kind":"pith_short_8","alias_value":"OXFILZ3X","created_at":"2026-05-22T01:03:55Z"}],"graph_snapshots":[{"event_id":"sha256:cc6edabe7d94e2531697896c7fe220fbe823b0ff02e1fb99ab76fbd54f0a693e","target":"graph","created_at":"2026-05-22T01:03:55Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2602.01851/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Recent generative models have achieved remarkable progress in image editing. However, existing systems and benchmarks remain largely text-guided. In contrast, human communication is inherently multimodal, where visual instructions such as sketches efficiently convey spatial and structural intent. To address this gap, we introduce VIBE, the Visual Instruction Benchmark for Image Editing with a three-level interaction hierarchy that captures deictic grounding, morphological manipulation, and causal reasoning. Across these levels, we curate high-quality and diverse test cases that reflect progres","authors_text":"Anna Korhonen, Chengzu Li, Chen Liang, Haochen Tian, Haodong Li, Huanyu Zhang, Liang Wang, Ruichuan An, Tieniu Tan, Xuehai Bai, Yifan Zhang, Zhang Zhang","cross_cats":[],"headline":"","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2026-02-02T09:24:45Z","title":"How Well Do Models Follow Visual Instructions? VIBE: A Systematic Benchmark for Visual Instruction-Driven Image Editing"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2602.01851","kind":"arxiv","version":2},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:bcdb30941a99bbff48a492169a0c2170792b667b565444822d8b4a3ceac1176f","target":"record","created_at":"2026-05-22T01:03:55Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"44b30b69f9237bbec3a0530d1e174a09de25282fa30b61140f9039433b4ac0ab","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2026-02-02T09:24:45Z","title_canon_sha256":"bc044520ec797e2a23053797128c318ddbce64491b1abbbc15cb080783d63c34"},"schema_version":"1.0","source":{"id":"2602.01851","kind":"arxiv","version":2}},"canonical_sha256":"75ca85e777ab4c961a8ffff2516e60a27e1c01fed4880f442ff36ceddccf033d","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"75ca85e777ab4c961a8ffff2516e60a27e1c01fed4880f442ff36ceddccf033d","first_computed_at":"2026-05-22T01:03:55.826794Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-22T01:03:55.826794Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"RB8XUbJlx7yrRf824A4moV8YSW5gCWWGUjh18k/NLz60MfcZVyW3Wwo0MXlU9iB5wAfnS+mHA8E3Cyjqhsx0BQ==","signature_status":"signed_v1","signed_at":"2026-05-22T01:03:55.827787Z","signed_message":"canonical_sha256_bytes"},"source_id":"2602.01851","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:bcdb30941a99bbff48a492169a0c2170792b667b565444822d8b4a3ceac1176f","sha256:cc6edabe7d94e2531697896c7fe220fbe823b0ff02e1fb99ab76fbd54f0a693e"],"state_sha256":"87d2e4289b1226a97731b2a8b3cbf7aee63b7db4af49eb06faf6934eb34d251c"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"1js5z98T/sIJTAiXBI/Ggbfgc/+h75yKWVUu6CJG+fbgsapZSAbiSjNN7YIoTv7zDsEQL6nGMMPS3kcZFznMAQ==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-24T08:35:21.390293Z","bundle_sha256":"cc00f89db672a687d35d31a85da2beef587a223bf0e33a26934be036ea17a501"}}