{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:SNHEDWXO7SQJMA6PWZ7WSUEOCR","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"628715a9745f18954b667c7cf58ab3600d2f1fc7a7cabb81666699325a2f17ed","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-14T09:49:32Z","title_canon_sha256":"137561c728ed7e7f958ed5bf73182b4dff0136a9a5fb4f7abcdfafa92a51e473"},"schema_version":"1.0","source":{"id":"2605.14635","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.14635","created_at":"2026-05-17T23:39:03Z"},{"alias_kind":"arxiv_version","alias_value":"2605.14635v1","created_at":"2026-05-17T23:39:03Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.14635","created_at":"2026-05-17T23:39:03Z"},{"alias_kind":"pith_short_12","alias_value":"SNHEDWXO7SQJ","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"SNHEDWXO7SQJMA6P","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"SNHEDWXO","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:1b0b397d8921789441c354a22f7822552ec93ef8d5cd30ed4eb81a1e794f364d","target":"graph","created_at":"2026-05-17T23:39:03Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Recent MLLMs show measurable progress on visual emotion prediction with the new multi-label benchmark, yet substantial room for improvement remains and LLM-as-a-judge does not consistently improve performance."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"Aggregating independent selections from twenty annotators per image produces a reliable and representative distribution of the emotions actually evoked by each image."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"MultiEmo-Bench supplies 10,344 images with aggregated multi-label emotion votes from 20 annotators each to evaluate MLLMs on dominant emotion and full distribution prediction."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"A multi-label benchmark with aggregated annotator votes shows recent MLLMs have advanced on visual emotion prediction but still leave substantial room for improvement."}],"snapshot_sha256":"05074797d2b386b954d5eae07c330bbd6b9c41ed6cf742848baa7d5230d06a41"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"70c8260b091248c20335ee1ad24cd5693bab71edecde846cb1b410c15b92605d"},"paper":{"abstract_excerpt":"This paper introduces a multi-label visual emotion analysis benchmark dataset for comprehensively evaluating the ability of multimodal large language models (MLLMs) to predict the emotions evoked by images. Recent user studies report an unintuitive finding: humans may prefer the predictions of MLLMs over the labels in existing datasets. We argue that this phenomenon stems from the suboptimal annotation scheme used in existing datasets, where each annotator is shown a single candidate emotion for each image and judges whether it is evoked or not. This approach is clearly limited because a singl","authors_text":"Mo Fan, Ryotaro Shimizu, Takashi Wada, Takuya Furusawa, Tianwei Chen, Yuki Hirakawa","cross_cats":["cs.AI"],"headline":"A multi-label benchmark with aggregated annotator votes shows recent MLLMs have advanced on visual emotion prediction but still leave substantial room for improvement.","license":"http://creativecommons.org/licenses/by-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-14T09:49:32Z","title":"MultiEmo-Bench: Multi-label Visual Emotion Analysis for Multi-modal Large Language Models"},"references":{"count":34,"internal_anchors":5,"resolved_work":34,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Achlioptas,P.,Ovsjanikov,M.,Haydarov,K.,Elhoseiny,M.,Guibas,L.J.:Artemis: Affective language for visual art. In: CVPR. pp. 11569–11579 (2021)","work_id":"bdf45c06-45ec-409c-babf-09e1becc199a","year":2021},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Anthropic: System card: Claude opus 4 & claude sonnet 4. Tech. rep., Anthropic (May 2025)","work_id":"455fdd87-9851-43bb-99f8-87c5a1be7273","year":2025},{"cited_arxiv_id":"2511.21631","doi":"","is_internal_anchor":true,"ref_index":3,"title":"Qwen3-VL Technical Report","work_id":"1fe243aa-e3c0-4da6-b391-4cbcfc88d5c0","year":2025},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Bhattacharyya, S., Wang, J.Z.: Evaluating vision-language models for emotion recognition. In: Chiruzzo, L., Ritter, A., Wang, L. (eds.) NAACL Findings. pp. 1798–1820. Association for Computational Lin","work_id":"71267ea9-ea02-4514-b3de-dd0d4c957c59","year":2025},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Chen, L., Li, J., Dong, X., Zhang, P., He, C., Wang, J., Zhao, F., Lin, D.: Sharegpt4v: Improving large multi-modal models with better captions. In: ECCV. vol. 15075, pp. 370–387 (2024)","work_id":"33cd9ab7-e9b9-4333-93f2-65b7cb51c3e5","year":2024}],"snapshot_sha256":"e8ef180eb95c8fbdd3729ce6916af9a38055eb7260708cfcfa3080dbb9510243"},"source":{"id":"2605.14635","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-15T05:43:20.081381Z","id":"2beec11f-e48a-4e06-bdf1-5a5219335834","model_set":{"reader":"grok-4.3"},"one_line_summary":"MultiEmo-Bench supplies 10,344 images with aggregated multi-label emotion votes from 20 annotators each to evaluate MLLMs on dominant emotion and full distribution prediction.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"A multi-label benchmark with aggregated annotator votes shows recent MLLMs have advanced on visual emotion prediction but still leave substantial room for improvement.","strongest_claim":"Recent MLLMs show measurable progress on visual emotion prediction with the new multi-label benchmark, yet substantial room for improvement remains and LLM-as-a-judge does not consistently improve performance.","weakest_assumption":"Aggregating independent selections from twenty annotators per image produces a reliable and representative distribution of the emotions actually evoked by each image."}},"verdict_id":"2beec11f-e48a-4e06-bdf1-5a5219335834"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:3493203f9f4f6373331bf517ea80ff1c032cf4c77bf56c6ef52adf975f8dd1c8","target":"record","created_at":"2026-05-17T23:39:03Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"628715a9745f18954b667c7cf58ab3600d2f1fc7a7cabb81666699325a2f17ed","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-14T09:49:32Z","title_canon_sha256":"137561c728ed7e7f958ed5bf73182b4dff0136a9a5fb4f7abcdfafa92a51e473"},"schema_version":"1.0","source":{"id":"2605.14635","kind":"arxiv","version":1}},"canonical_sha256":"934e41daeefca09603cfb67f69508e1470e285b921bda65b6789932ca7343b45","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"934e41daeefca09603cfb67f69508e1470e285b921bda65b6789932ca7343b45","first_computed_at":"2026-05-17T23:39:03.934180Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:39:03.934180Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"YRVMSbqWHSsicoVd2+EL057ru8xqMTGpo6STbh1Oec2jJ5ErDYmbr80SG2Js0BdfxkbhCEYmo9iRCiJd75mYDA==","signature_status":"signed_v1","signed_at":"2026-05-17T23:39:03.934883Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.14635","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:3493203f9f4f6373331bf517ea80ff1c032cf4c77bf56c6ef52adf975f8dd1c8","sha256:1b0b397d8921789441c354a22f7822552ec93ef8d5cd30ed4eb81a1e794f364d"],"state_sha256":"440272c09253d69f0c9ed8b40e83295afcb9beba53ce6e7f5b51ed0819883576"}