{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:A6HWBVDMY7V2WYDJYKIAZEEYNB","short_pith_number":"pith:A6HWBVDM","canonical_record":{"source":{"id":"2605.19223","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-19T00:48:14Z","cross_cats_sorted":[],"title_canon_sha256":"e9c6e92a1fc2cb018f3ac389c2d2f8720b79378f3e7985a6718a40f14087461f","abstract_canon_sha256":"4783a6cbf49254359676c9c9ed0b51cc6f5a6d7892ca3597dc5cfb17ed78456e"},"schema_version":"1.0"},"canonical_sha256":"078f60d46cc7ebab6069c2900c9098684886680ab33a952722238ac53331fe1f","source":{"kind":"arxiv","id":"2605.19223","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.19223","created_at":"2026-05-20T01:05:34Z"},{"alias_kind":"arxiv_version","alias_value":"2605.19223v1","created_at":"2026-05-20T01:05:34Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.19223","created_at":"2026-05-20T01:05:34Z"},{"alias_kind":"pith_short_12","alias_value":"A6HWBVDMY7V2","created_at":"2026-05-20T01:05:34Z"},{"alias_kind":"pith_short_16","alias_value":"A6HWBVDMY7V2WYDJ","created_at":"2026-05-20T01:05:34Z"},{"alias_kind":"pith_short_8","alias_value":"A6HWBVDM","created_at":"2026-05-20T01:05:34Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:A6HWBVDMY7V2WYDJYKIAZEEYNB","target":"record","payload":{"canonical_record":{"source":{"id":"2605.19223","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-19T00:48:14Z","cross_cats_sorted":[],"title_canon_sha256":"e9c6e92a1fc2cb018f3ac389c2d2f8720b79378f3e7985a6718a40f14087461f","abstract_canon_sha256":"4783a6cbf49254359676c9c9ed0b51cc6f5a6d7892ca3597dc5cfb17ed78456e"},"schema_version":"1.0"},"canonical_sha256":"078f60d46cc7ebab6069c2900c9098684886680ab33a952722238ac53331fe1f","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T01:05:34.335487Z","signature_b64":"oJWee3CKJNKw1Smrbpv5jBrJHNqqv39k3dXtWfba3WpNZG6cZcI9IburHCtlOD3rmKSxsfOUgfUObakh+KuTDQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"078f60d46cc7ebab6069c2900c9098684886680ab33a952722238ac53331fe1f","last_reissued_at":"2026-05-20T01:05:34.334835Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T01:05:34.334835Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2605.19223","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-20T01:05:34Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"TnVslmzJ+i27gUnpCGNnu+Cf2NqgvLbJapayMWfF56MPIdWRG6F0htlA4Kr6lNO464WtntSYJ6L2/VW9AjJxBw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-28T04:30:55.375866Z"},"content_sha256":"f2f806de5b7d01e5e4e5a0b7d84a7ea68b788e1718bcbb9a6772564f5b1e17a4","schema_version":"1.0","event_id":"sha256:f2f806de5b7d01e5e4e5a0b7d84a7ea68b788e1718bcbb9a6772564f5b1e17a4"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:A6HWBVDMY7V2WYDJYKIAZEEYNB","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"HAVEN: Hierarchically Aligned Multimodal Benchmark for Unified Video Understanding","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.CV","authors_text":"HaoPeng Zhang, Mengqi Shi","submitted_at":"2026-05-19T00:48:14Z","abstract_excerpt":"While Multimodal Large Language Models (MLLMs) exhibit strong performance on standard video tasks, their ability to faithfully summarize and reason over complex narratives remains poorly evaluated. Existing summarization benchmarks fragment supervision across isolated granularities, such as keyframes, key shots, or disjointed text summaries, failing to capture the inherently hierarchical structure of cross-modal alignment. To address this critical gap, we introduce HAVEN, a hierarchically aligned multimodal benchmark for unified video understanding. HAVEN pioneers a fully granular (frame, shot"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.19223","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.19223/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-20T01:05:34Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"3JubjJHmScB8E62Nr98qZr/IBb850P6hTMSO0yRkPAf9N67RkYLhsAWjgwFG+g6jRHBbou9Zq6+W2viKr4voCg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-28T04:30:55.376565Z"},"content_sha256":"b136e1bdce22c76a6d66cadeb6cfdc8ae42b279bc739c2d0438c60a4659aef4f","schema_version":"1.0","event_id":"sha256:b136e1bdce22c76a6d66cadeb6cfdc8ae42b279bc739c2d0438c60a4659aef4f"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/A6HWBVDMY7V2WYDJYKIAZEEYNB/bundle.json","state_url":"https://pith.science/pith/A6HWBVDMY7V2WYDJYKIAZEEYNB/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/A6HWBVDMY7V2WYDJYKIAZEEYNB/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-28T04:30:55Z","links":{"resolver":"https://pith.science/pith/A6HWBVDMY7V2WYDJYKIAZEEYNB","bundle":"https://pith.science/pith/A6HWBVDMY7V2WYDJYKIAZEEYNB/bundle.json","state":"https://pith.science/pith/A6HWBVDMY7V2WYDJYKIAZEEYNB/state.json","well_known_bundle":"https://pith.science/.well-known/pith/A6HWBVDMY7V2WYDJYKIAZEEYNB/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:A6HWBVDMY7V2WYDJYKIAZEEYNB","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"4783a6cbf49254359676c9c9ed0b51cc6f5a6d7892ca3597dc5cfb17ed78456e","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-19T00:48:14Z","title_canon_sha256":"e9c6e92a1fc2cb018f3ac389c2d2f8720b79378f3e7985a6718a40f14087461f"},"schema_version":"1.0","source":{"id":"2605.19223","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.19223","created_at":"2026-05-20T01:05:34Z"},{"alias_kind":"arxiv_version","alias_value":"2605.19223v1","created_at":"2026-05-20T01:05:34Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.19223","created_at":"2026-05-20T01:05:34Z"},{"alias_kind":"pith_short_12","alias_value":"A6HWBVDMY7V2","created_at":"2026-05-20T01:05:34Z"},{"alias_kind":"pith_short_16","alias_value":"A6HWBVDMY7V2WYDJ","created_at":"2026-05-20T01:05:34Z"},{"alias_kind":"pith_short_8","alias_value":"A6HWBVDM","created_at":"2026-05-20T01:05:34Z"}],"graph_snapshots":[{"event_id":"sha256:b136e1bdce22c76a6d66cadeb6cfdc8ae42b279bc739c2d0438c60a4659aef4f","target":"graph","created_at":"2026-05-20T01:05:34Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2605.19223/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"While Multimodal Large Language Models (MLLMs) exhibit strong performance on standard video tasks, their ability to faithfully summarize and reason over complex narratives remains poorly evaluated. Existing summarization benchmarks fragment supervision across isolated granularities, such as keyframes, key shots, or disjointed text summaries, failing to capture the inherently hierarchical structure of cross-modal alignment. To address this critical gap, we introduce HAVEN, a hierarchically aligned multimodal benchmark for unified video understanding. HAVEN pioneers a fully granular (frame, shot","authors_text":"HaoPeng Zhang, Mengqi Shi","cross_cats":[],"headline":"","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-19T00:48:14Z","title":"HAVEN: Hierarchically Aligned Multimodal Benchmark for Unified Video Understanding"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.19223","kind":"arxiv","version":1},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:f2f806de5b7d01e5e4e5a0b7d84a7ea68b788e1718bcbb9a6772564f5b1e17a4","target":"record","created_at":"2026-05-20T01:05:34Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"4783a6cbf49254359676c9c9ed0b51cc6f5a6d7892ca3597dc5cfb17ed78456e","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-19T00:48:14Z","title_canon_sha256":"e9c6e92a1fc2cb018f3ac389c2d2f8720b79378f3e7985a6718a40f14087461f"},"schema_version":"1.0","source":{"id":"2605.19223","kind":"arxiv","version":1}},"canonical_sha256":"078f60d46cc7ebab6069c2900c9098684886680ab33a952722238ac53331fe1f","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"078f60d46cc7ebab6069c2900c9098684886680ab33a952722238ac53331fe1f","first_computed_at":"2026-05-20T01:05:34.334835Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-20T01:05:34.334835Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"oJWee3CKJNKw1Smrbpv5jBrJHNqqv39k3dXtWfba3WpNZG6cZcI9IburHCtlOD3rmKSxsfOUgfUObakh+KuTDQ==","signature_status":"signed_v1","signed_at":"2026-05-20T01:05:34.335487Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.19223","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:f2f806de5b7d01e5e4e5a0b7d84a7ea68b788e1718bcbb9a6772564f5b1e17a4","sha256:b136e1bdce22c76a6d66cadeb6cfdc8ae42b279bc739c2d0438c60a4659aef4f"],"state_sha256":"0bb4589fd2f4f09ad723fc4df4463b31e7bb1bf41edc9e6fa5d11f37ada188dc"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"gC28ybkDSTHY9DA74Hgo7GS1ZpskBBYFEl1ecTsar31205gwSSNwiTDchy/qOl8c/r+0kStUD4POyLAX2yNOBg==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-28T04:30:55.379839Z","bundle_sha256":"6eb815a9389b9478baca48aa1c88f13240e0a006a08e8975ef600a25e93b1990"}}