{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:WBX46RPTUXTZZQPZ5TFC3MIO7P","short_pith_number":"pith:WBX46RPT","canonical_record":{"source":{"id":"2605.21954","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-21T03:40:22Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"9cbbc407d74feff76c3b8a3db3558c410bad05b602d1e487fc1bb7dfcc5d78ad","abstract_canon_sha256":"3939f615e9494e554aad6392f42ae78ac6f9680a207f9e53ca0530540655bfd1"},"schema_version":"1.0"},"canonical_sha256":"b06fcf45f3a5e79cc1f9ecca2db10efbe3380ad0e9f634a8a13c13b75809c0d5","source":{"kind":"arxiv","id":"2605.21954","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.21954","created_at":"2026-05-22T01:04:16Z"},{"alias_kind":"arxiv_version","alias_value":"2605.21954v1","created_at":"2026-05-22T01:04:16Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.21954","created_at":"2026-05-22T01:04:16Z"},{"alias_kind":"pith_short_12","alias_value":"WBX46RPTUXTZ","created_at":"2026-05-22T01:04:16Z"},{"alias_kind":"pith_short_16","alias_value":"WBX46RPTUXTZZQPZ","created_at":"2026-05-22T01:04:16Z"},{"alias_kind":"pith_short_8","alias_value":"WBX46RPT","created_at":"2026-05-22T01:04:16Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:WBX46RPTUXTZZQPZ5TFC3MIO7P","target":"record","payload":{"canonical_record":{"source":{"id":"2605.21954","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-21T03:40:22Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"9cbbc407d74feff76c3b8a3db3558c410bad05b602d1e487fc1bb7dfcc5d78ad","abstract_canon_sha256":"3939f615e9494e554aad6392f42ae78ac6f9680a207f9e53ca0530540655bfd1"},"schema_version":"1.0"},"canonical_sha256":"b06fcf45f3a5e79cc1f9ecca2db10efbe3380ad0e9f634a8a13c13b75809c0d5","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-22T01:04:16.985197Z","signature_b64":"wTZ+q04PBEikBiTpIVIxKYo3cOD6q2oV1ZTYKe4rEuysJ1fErmezz1ylN4XDbOQViC5w6AS5GUOLiYVzQG7NCg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"b06fcf45f3a5e79cc1f9ecca2db10efbe3380ad0e9f634a8a13c13b75809c0d5","last_reissued_at":"2026-05-22T01:04:16.984430Z","signature_status":"signed_v1","first_computed_at":"2026-05-22T01:04:16.984430Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2605.21954","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-22T01:04:16Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"Ql0fjpdrjj3QgL+fi4ucoD95c35fH/uv2rkd+JEkk+YOIfmgsQiDqWvsK2cYh8f6/2GqqUhm2Y8eFCiCZtjrBg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-26T06:02:46.523493Z"},"content_sha256":"0eed18d6b54827a0ddf6ba6f60596b5e140466a671554fcaa05c91225f22210f","schema_version":"1.0","event_id":"sha256:0eed18d6b54827a0ddf6ba6f60596b5e140466a671554fcaa05c91225f22210f"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:WBX46RPTUXTZZQPZ5TFC3MIO7P","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"MLLMs Know When Before Speaking: Revealing and Recovering Temporal Grounding via Attention Cues","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.CV","authors_text":"Dazhao Du, Eric Liu, Jian Liu, Liao Duan, Song Guo, Tao Han, Xi Chen, Yujia Zhang","submitted_at":"2026-05-21T03:40:22Z","abstract_excerpt":"Video temporal grounding (VTG), which localizes the start and end times of a queried event in an untrimmed video, is a key test of whether multimodal large language models (MLLMs) understand not only what happens but also when it happens. Although modern MLLMs describe video content fluently, their timestamp predictions remain unreliable, while existing remedies either require costly post-training on temporal annotations or rely on coarse training-free heuristics. In this work, we probe the cross-modal attention of MLLMs and uncover a perception-generation gap. Our key finding is that MLLMs of"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.21954","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.21954/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-22T01:04:16Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"gDV4pHjw/p41Cct9Eq7c807q2FhCGaMWixZ+b2iszFPlQjqCH5ASxhAKN/YZBWgmhUcTTCzLi0hDhNNIE+sSBQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-26T06:02:46.524195Z"},"content_sha256":"60ca3556c23fe5919cacd5bf5602c278695710cc8ba7b0497052715c1df37ab3","schema_version":"1.0","event_id":"sha256:60ca3556c23fe5919cacd5bf5602c278695710cc8ba7b0497052715c1df37ab3"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/WBX46RPTUXTZZQPZ5TFC3MIO7P/bundle.json","state_url":"https://pith.science/pith/WBX46RPTUXTZZQPZ5TFC3MIO7P/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/WBX46RPTUXTZZQPZ5TFC3MIO7P/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-26T06:02:46Z","links":{"resolver":"https://pith.science/pith/WBX46RPTUXTZZQPZ5TFC3MIO7P","bundle":"https://pith.science/pith/WBX46RPTUXTZZQPZ5TFC3MIO7P/bundle.json","state":"https://pith.science/pith/WBX46RPTUXTZZQPZ5TFC3MIO7P/state.json","well_known_bundle":"https://pith.science/.well-known/pith/WBX46RPTUXTZZQPZ5TFC3MIO7P/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:WBX46RPTUXTZZQPZ5TFC3MIO7P","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"3939f615e9494e554aad6392f42ae78ac6f9680a207f9e53ca0530540655bfd1","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-21T03:40:22Z","title_canon_sha256":"9cbbc407d74feff76c3b8a3db3558c410bad05b602d1e487fc1bb7dfcc5d78ad"},"schema_version":"1.0","source":{"id":"2605.21954","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.21954","created_at":"2026-05-22T01:04:16Z"},{"alias_kind":"arxiv_version","alias_value":"2605.21954v1","created_at":"2026-05-22T01:04:16Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.21954","created_at":"2026-05-22T01:04:16Z"},{"alias_kind":"pith_short_12","alias_value":"WBX46RPTUXTZ","created_at":"2026-05-22T01:04:16Z"},{"alias_kind":"pith_short_16","alias_value":"WBX46RPTUXTZZQPZ","created_at":"2026-05-22T01:04:16Z"},{"alias_kind":"pith_short_8","alias_value":"WBX46RPT","created_at":"2026-05-22T01:04:16Z"}],"graph_snapshots":[{"event_id":"sha256:60ca3556c23fe5919cacd5bf5602c278695710cc8ba7b0497052715c1df37ab3","target":"graph","created_at":"2026-05-22T01:04:16Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2605.21954/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Video temporal grounding (VTG), which localizes the start and end times of a queried event in an untrimmed video, is a key test of whether multimodal large language models (MLLMs) understand not only what happens but also when it happens. Although modern MLLMs describe video content fluently, their timestamp predictions remain unreliable, while existing remedies either require costly post-training on temporal annotations or rely on coarse training-free heuristics. In this work, we probe the cross-modal attention of MLLMs and uncover a perception-generation gap. Our key finding is that MLLMs of","authors_text":"Dazhao Du, Eric Liu, Jian Liu, Liao Duan, Song Guo, Tao Han, Xi Chen, Yujia Zhang","cross_cats":["cs.AI"],"headline":"","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-21T03:40:22Z","title":"MLLMs Know When Before Speaking: Revealing and Recovering Temporal Grounding via Attention Cues"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.21954","kind":"arxiv","version":1},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:0eed18d6b54827a0ddf6ba6f60596b5e140466a671554fcaa05c91225f22210f","target":"record","created_at":"2026-05-22T01:04:16Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"3939f615e9494e554aad6392f42ae78ac6f9680a207f9e53ca0530540655bfd1","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-21T03:40:22Z","title_canon_sha256":"9cbbc407d74feff76c3b8a3db3558c410bad05b602d1e487fc1bb7dfcc5d78ad"},"schema_version":"1.0","source":{"id":"2605.21954","kind":"arxiv","version":1}},"canonical_sha256":"b06fcf45f3a5e79cc1f9ecca2db10efbe3380ad0e9f634a8a13c13b75809c0d5","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"b06fcf45f3a5e79cc1f9ecca2db10efbe3380ad0e9f634a8a13c13b75809c0d5","first_computed_at":"2026-05-22T01:04:16.984430Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-22T01:04:16.984430Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"wTZ+q04PBEikBiTpIVIxKYo3cOD6q2oV1ZTYKe4rEuysJ1fErmezz1ylN4XDbOQViC5w6AS5GUOLiYVzQG7NCg==","signature_status":"signed_v1","signed_at":"2026-05-22T01:04:16.985197Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.21954","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:0eed18d6b54827a0ddf6ba6f60596b5e140466a671554fcaa05c91225f22210f","sha256:60ca3556c23fe5919cacd5bf5602c278695710cc8ba7b0497052715c1df37ab3"],"state_sha256":"992e0adeb33ac05f500e584f23a1c3aa0b48aed13bb6bdb6769f95c3777704b2"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"1BRryGpFUI/pKchF8EJBAckN3i1iD2Hbl/UDjlwMh5fHIXCrEIbe+5qyiSWSK3KiwUfe4Yr/3KAwKWMQqn5BAA==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-26T06:02:46.528536Z","bundle_sha256":"dd9090860f10f22fd765fc19d06b027ec332b540ff92aa518b3c5a2d2b931e97"}}