{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2025:WHD2ZAOCEHRY3P4PBHP5FF2RDY","short_pith_number":"pith:WHD2ZAOC","canonical_record":{"source":{"id":"2505.21374","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2025-05-27T16:05:01Z","cross_cats_sorted":[],"title_canon_sha256":"1037a1b2b279b5f0742dc6dfa56f6ffc64357cdb3e474d708d8ec7e95ff08200","abstract_canon_sha256":"7d62d4aba317088c9ae2a9712056750f44141128f5c8fcb45341f9e87195b8f1"},"schema_version":"1.0"},"canonical_sha256":"b1c7ac81c221e38dbf8f09dfd297511e28e68d6946a16ac84740f6bd226f0367","source":{"kind":"arxiv","id":"2505.21374","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2505.21374","created_at":"2026-05-17T23:38:14Z"},{"alias_kind":"arxiv_version","alias_value":"2505.21374v1","created_at":"2026-05-17T23:38:14Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2505.21374","created_at":"2026-05-17T23:38:14Z"},{"alias_kind":"pith_short_12","alias_value":"WHD2ZAOCEHRY","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"WHD2ZAOCEHRY3P4P","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"WHD2ZAOC","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2025:WHD2ZAOCEHRY3P4PBHP5FF2RDY","target":"record","payload":{"canonical_record":{"source":{"id":"2505.21374","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2025-05-27T16:05:01Z","cross_cats_sorted":[],"title_canon_sha256":"1037a1b2b279b5f0742dc6dfa56f6ffc64357cdb3e474d708d8ec7e95ff08200","abstract_canon_sha256":"7d62d4aba317088c9ae2a9712056750f44141128f5c8fcb45341f9e87195b8f1"},"schema_version":"1.0"},"canonical_sha256":"b1c7ac81c221e38dbf8f09dfd297511e28e68d6946a16ac84740f6bd226f0367","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:14.952899Z","signature_b64":"DPxV2gSktf/tu7Om/tLwv/hBsvnVq/UXncyTX5Fm6H79zbcHZvLCbikHs/KxGZ16hJUoItoop6k3vd1cYtSrBQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"b1c7ac81c221e38dbf8f09dfd297511e28e68d6946a16ac84740f6bd226f0367","last_reissued_at":"2026-05-17T23:38:14.952213Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:14.952213Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2505.21374","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:14Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"nV1wwi/AjuwmgZcNsqm92BTqiwikM3zqrugTRHA8jH96pXhIsdCh4To4ahzmiHBjP/SqIy5B8Rv84eFmE6ANDw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-30T01:20:09.677457Z"},"content_sha256":"f534dd5b5a57e12c42a02fec54a855a58e2e2840faa60b5025295f051ce4ddce","schema_version":"1.0","event_id":"sha256:f534dd5b5a57e12c42a02fec54a855a58e2e2840faa60b5025295f051ce4ddce"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2025:WHD2ZAOCEHRY3P4PBHP5FF2RDY","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Video-Holmes: Can MLLM Think Like Holmes for Complex Video Reasoning?","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Multimodal models perceive video details but fail to integrate scattered clues, scoring at most 45 percent on a new Holmes-inspired benchmark.","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Jing Liao, Junhao Cheng, Teng Wang, Ying Shan, Yixiao Ge, Yuying Ge","submitted_at":"2025-05-27T16:05:01Z","abstract_excerpt":"Recent advances in CoT reasoning and RL post-training have been reported to enhance video reasoning capabilities of MLLMs. This progress naturally raises a question: can these models perform complex video reasoning in a manner comparable to human experts? However, existing video benchmarks primarily evaluate visual perception and grounding abilities, with questions that can be answered based on explicit prompts or isolated visual cues. Such benchmarks do not fully capture the intricacies of real-world reasoning, where humans must actively search for, integrate, and analyze multiple clues befor"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Our comprehensive evaluation of state-of-the-art MLLMs reveals that, while these models generally excel at visual perception, they encounter substantial difficulties with integrating information and often miss critical clues. For example, the best-performing model, Gemini-2.5-Pro, achieves an accuracy of only 45%, with most models scoring below 40%.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The assumption that the seven manually designed tasks from suspense films accurately require and measure active search, integration, and analysis of multiple clues in a manner comparable to human expert reasoning.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Video-Holmes benchmark shows top MLLMs achieve at most 45% accuracy on tasks needing integration of multiple clues from suspense films, unlike existing perception-focused tests.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Multimodal models perceive video details but fail to integrate scattered clues, scoring at most 45 percent on a new Holmes-inspired benchmark.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"3704675f962dacd801246b4cb3c35d05ba10827bd3a3fc69eab6d8ec07ac857a"},"source":{"id":"2505.21374","kind":"arxiv","version":1},"verdict":{"id":"ebc0c269-76f7-412b-9420-4ff5479fcbe3","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-17T05:36:20.043648Z","strongest_claim":"Our comprehensive evaluation of state-of-the-art MLLMs reveals that, while these models generally excel at visual perception, they encounter substantial difficulties with integrating information and often miss critical clues. For example, the best-performing model, Gemini-2.5-Pro, achieves an accuracy of only 45%, with most models scoring below 40%.","one_line_summary":"Video-Holmes benchmark shows top MLLMs achieve at most 45% accuracy on tasks needing integration of multiple clues from suspense films, unlike existing perception-focused tests.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The assumption that the seven manually designed tasks from suspense films accurately require and measure active search, integration, and analysis of multiple clues in a manner comparable to human expert reasoning.","pith_extraction_headline":"Multimodal models perceive video details but fail to integrate scattered clues, scoring at most 45 percent on a new Holmes-inspired benchmark."},"references":{"count":51,"sample":[{"doi":"","year":2022,"title":"Chain-of-thought prompting elicits reasoning in large language models","work_id":"4160f614-809e-4eb0-8951-702539a20d52","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models","work_id":"c5006563-f3ec-438a-9e35-b7b484f34828","ref_index":2,"cited_arxiv_id":"2402.03300","is_internal_anchor":true},{"doi":"","year":2025,"title":"DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning","work_id":"e6b75ad5-2877-4168-97c8-710407094d20","ref_index":3,"cited_arxiv_id":"2501.12948","is_internal_anchor":true},{"doi":"","year":2024,"title":"Introducing openai o1","work_id":"993616f2-1ea2-492f-857c-c3236709e4af","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2025,"title":"OpenAI. Openai o3. 2025. 2, 9","work_id":"a6e82b4b-c165-409b-b4cc-1512baf99410","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":51,"snapshot_sha256":"7ea1c5d586d2fb268f50ac2a75fdf39861e27fd04aecb63a1a3fd1cef3ba6378","internal_anchors":21},"formal_canon":{"evidence_count":3,"snapshot_sha256":"b78b4c121060f46eb3708a4ffc3c6c4462c3c98da84b660eeddf9b30b2c974b3"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"ebc0c269-76f7-412b-9420-4ff5479fcbe3"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:14Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"BUKKfmGlBdXPEtUVH3D8LjUT9udlEUOE4lyTJezEkuW3LtI8dDjI3q/bjq4YokhKc9pJBVeLp/SPVEU1VbrPAg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-30T01:20:09.678047Z"},"content_sha256":"c9a604ee087b00eb1a283c8d9b3a6bde573e0845287821193781fbbc22317149","schema_version":"1.0","event_id":"sha256:c9a604ee087b00eb1a283c8d9b3a6bde573e0845287821193781fbbc22317149"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/WHD2ZAOCEHRY3P4PBHP5FF2RDY/bundle.json","state_url":"https://pith.science/pith/WHD2ZAOCEHRY3P4PBHP5FF2RDY/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/WHD2ZAOCEHRY3P4PBHP5FF2RDY/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-30T01:20:09Z","links":{"resolver":"https://pith.science/pith/WHD2ZAOCEHRY3P4PBHP5FF2RDY","bundle":"https://pith.science/pith/WHD2ZAOCEHRY3P4PBHP5FF2RDY/bundle.json","state":"https://pith.science/pith/WHD2ZAOCEHRY3P4PBHP5FF2RDY/state.json","well_known_bundle":"https://pith.science/.well-known/pith/WHD2ZAOCEHRY3P4PBHP5FF2RDY/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2025:WHD2ZAOCEHRY3P4PBHP5FF2RDY","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"7d62d4aba317088c9ae2a9712056750f44141128f5c8fcb45341f9e87195b8f1","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2025-05-27T16:05:01Z","title_canon_sha256":"1037a1b2b279b5f0742dc6dfa56f6ffc64357cdb3e474d708d8ec7e95ff08200"},"schema_version":"1.0","source":{"id":"2505.21374","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2505.21374","created_at":"2026-05-17T23:38:14Z"},{"alias_kind":"arxiv_version","alias_value":"2505.21374v1","created_at":"2026-05-17T23:38:14Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2505.21374","created_at":"2026-05-17T23:38:14Z"},{"alias_kind":"pith_short_12","alias_value":"WHD2ZAOCEHRY","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"WHD2ZAOCEHRY3P4P","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"WHD2ZAOC","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:c9a604ee087b00eb1a283c8d9b3a6bde573e0845287821193781fbbc22317149","target":"graph","created_at":"2026-05-17T23:38:14Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Our comprehensive evaluation of state-of-the-art MLLMs reveals that, while these models generally excel at visual perception, they encounter substantial difficulties with integrating information and often miss critical clues. For example, the best-performing model, Gemini-2.5-Pro, achieves an accuracy of only 45%, with most models scoring below 40%."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"The assumption that the seven manually designed tasks from suspense films accurately require and measure active search, integration, and analysis of multiple clues in a manner comparable to human expert reasoning."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Video-Holmes benchmark shows top MLLMs achieve at most 45% accuracy on tasks needing integration of multiple clues from suspense films, unlike existing perception-focused tests."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Multimodal models perceive video details but fail to integrate scattered clues, scoring at most 45 percent on a new Holmes-inspired benchmark."}],"snapshot_sha256":"3704675f962dacd801246b4cb3c35d05ba10827bd3a3fc69eab6d8ec07ac857a"},"formal_canon":{"evidence_count":3,"snapshot_sha256":"b78b4c121060f46eb3708a4ffc3c6c4462c3c98da84b660eeddf9b30b2c974b3"},"paper":{"abstract_excerpt":"Recent advances in CoT reasoning and RL post-training have been reported to enhance video reasoning capabilities of MLLMs. This progress naturally raises a question: can these models perform complex video reasoning in a manner comparable to human experts? However, existing video benchmarks primarily evaluate visual perception and grounding abilities, with questions that can be answered based on explicit prompts or isolated visual cues. Such benchmarks do not fully capture the intricacies of real-world reasoning, where humans must actively search for, integrate, and analyze multiple clues befor","authors_text":"Jing Liao, Junhao Cheng, Teng Wang, Ying Shan, Yixiao Ge, Yuying Ge","cross_cats":[],"headline":"Multimodal models perceive video details but fail to integrate scattered clues, scoring at most 45 percent on a new Holmes-inspired benchmark.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2025-05-27T16:05:01Z","title":"Video-Holmes: Can MLLM Think Like Holmes for Complex Video Reasoning?"},"references":{"count":51,"internal_anchors":21,"resolved_work":51,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Chain-of-thought prompting elicits reasoning in large language models","work_id":"4160f614-809e-4eb0-8951-702539a20d52","year":2022},{"cited_arxiv_id":"2402.03300","doi":"","is_internal_anchor":true,"ref_index":2,"title":"DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models","work_id":"c5006563-f3ec-438a-9e35-b7b484f34828","year":2024},{"cited_arxiv_id":"2501.12948","doi":"","is_internal_anchor":true,"ref_index":3,"title":"DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning","work_id":"e6b75ad5-2877-4168-97c8-710407094d20","year":2025},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Introducing openai o1","work_id":"993616f2-1ea2-492f-857c-c3236709e4af","year":2024},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"OpenAI. Openai o3. 2025. 2, 9","work_id":"a6e82b4b-c165-409b-b4cc-1512baf99410","year":2025}],"snapshot_sha256":"7ea1c5d586d2fb268f50ac2a75fdf39861e27fd04aecb63a1a3fd1cef3ba6378"},"source":{"id":"2505.21374","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-17T05:36:20.043648Z","id":"ebc0c269-76f7-412b-9420-4ff5479fcbe3","model_set":{"reader":"grok-4.3"},"one_line_summary":"Video-Holmes benchmark shows top MLLMs achieve at most 45% accuracy on tasks needing integration of multiple clues from suspense films, unlike existing perception-focused tests.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Multimodal models perceive video details but fail to integrate scattered clues, scoring at most 45 percent on a new Holmes-inspired benchmark.","strongest_claim":"Our comprehensive evaluation of state-of-the-art MLLMs reveals that, while these models generally excel at visual perception, they encounter substantial difficulties with integrating information and often miss critical clues. For example, the best-performing model, Gemini-2.5-Pro, achieves an accuracy of only 45%, with most models scoring below 40%.","weakest_assumption":"The assumption that the seven manually designed tasks from suspense films accurately require and measure active search, integration, and analysis of multiple clues in a manner comparable to human expert reasoning."}},"verdict_id":"ebc0c269-76f7-412b-9420-4ff5479fcbe3"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:f534dd5b5a57e12c42a02fec54a855a58e2e2840faa60b5025295f051ce4ddce","target":"record","created_at":"2026-05-17T23:38:14Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"7d62d4aba317088c9ae2a9712056750f44141128f5c8fcb45341f9e87195b8f1","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2025-05-27T16:05:01Z","title_canon_sha256":"1037a1b2b279b5f0742dc6dfa56f6ffc64357cdb3e474d708d8ec7e95ff08200"},"schema_version":"1.0","source":{"id":"2505.21374","kind":"arxiv","version":1}},"canonical_sha256":"b1c7ac81c221e38dbf8f09dfd297511e28e68d6946a16ac84740f6bd226f0367","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"b1c7ac81c221e38dbf8f09dfd297511e28e68d6946a16ac84740f6bd226f0367","first_computed_at":"2026-05-17T23:38:14.952213Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:14.952213Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"DPxV2gSktf/tu7Om/tLwv/hBsvnVq/UXncyTX5Fm6H79zbcHZvLCbikHs/KxGZ16hJUoItoop6k3vd1cYtSrBQ==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:14.952899Z","signed_message":"canonical_sha256_bytes"},"source_id":"2505.21374","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:f534dd5b5a57e12c42a02fec54a855a58e2e2840faa60b5025295f051ce4ddce","sha256:c9a604ee087b00eb1a283c8d9b3a6bde573e0845287821193781fbbc22317149"],"state_sha256":"9647bc84c3526c902f573f0f2df9216c5c72f21d1c53479c3b8a1b310fdf1803"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"NIYEASnbniEFOQ3Gp2zJUG6cKkBMhSXpOllVpwOOlZWtt/JPwsx8yVWLFspWphO+W1lW0pJaR7ZV9mlkIra+CA==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-30T01:20:09.681607Z","bundle_sha256":"23ebf469cfd4ed022020ade7b8f0055027d7a699e5dbd3a92c8ca97ca1440df4"}}