{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:TSOOYSZPJU76MPHXCEA4JLOQU2","short_pith_number":"pith:TSOOYSZP","schema_version":"1.0","canonical_sha256":"9c9cec4b2f4d3fe63cf71101c4add0a6bdf3debd9aae6a14a94c638bc38a8782","source":{"kind":"arxiv","id":"2606.03920","version":1},"attestation_state":"computed","paper":{"title":"Benchmarking Visual State Tracking in Multimodal Video Understanding","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Boyang Zheng, Ellis Brown, Hyunseok Lee, Jinwoo Shin, June Suk Choi, Nanye Ma, Oscar Michel, Pinzhi Huang, Saining Xie, Shusheng Yang, Sihyun Yu","submitted_at":"2026-06-02T17:12:05Z","abstract_excerpt":"Understanding a video requires more than recognizing isolated moments, as humans continuously track entities, states, and events over time. This capacity for visual state tracking is fundamental to video understanding, yet remains underexplored in current evaluations of Multimodal Large Language Models (MLLMs). We introduce Visual STAte Tracking benchmark (VSTAT), a video-based benchmark designed to diagnose visual state tracking in MLLMs. VSTAT consists of 834 clips drawn from both synthetic and real-world videos, paired with 1,500 questions that cannot be answered from any single frame or sh"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.03920","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-06-02T17:12:05Z","cross_cats_sorted":[],"title_canon_sha256":"e234980d17a1e68252ac2ce488ab7b1d483384bd28d7d84c75580c6e79750caa","abstract_canon_sha256":"81517c1699f582d13303637547fda0a6dad8085240ad75dfc06032636278f768"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-03T02:06:06.942391Z","signature_b64":"DvtraZbpsRu4MElFpPbahk/7Sw10+a3ejs8RS7UlaGY1tVAPLUpomvlYvT3Y18SzpTe7WD+xikzBg1nw8DQxAw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"9c9cec4b2f4d3fe63cf71101c4add0a6bdf3debd9aae6a14a94c638bc38a8782","last_reissued_at":"2026-06-03T02:06:06.941991Z","signature_status":"signed_v1","first_computed_at":"2026-06-03T02:06:06.941991Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Benchmarking Visual State Tracking in Multimodal Video Understanding","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Boyang Zheng, Ellis Brown, Hyunseok Lee, Jinwoo Shin, June Suk Choi, Nanye Ma, Oscar Michel, Pinzhi Huang, Saining Xie, Shusheng Yang, Sihyun Yu","submitted_at":"2026-06-02T17:12:05Z","abstract_excerpt":"Understanding a video requires more than recognizing isolated moments, as humans continuously track entities, states, and events over time. This capacity for visual state tracking is fundamental to video understanding, yet remains underexplored in current evaluations of Multimodal Large Language Models (MLLMs). We introduce Visual STAte Tracking benchmark (VSTAT), a video-based benchmark designed to diagnose visual state tracking in MLLMs. VSTAT consists of 834 clips drawn from both synthetic and real-world videos, paired with 1,500 questions that cannot be answered from any single frame or sh"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.03920","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.03920/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.03920","created_at":"2026-06-03T02:06:06.942052+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.03920v1","created_at":"2026-06-03T02:06:06.942052+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.03920","created_at":"2026-06-03T02:06:06.942052+00:00"},{"alias_kind":"pith_short_12","alias_value":"TSOOYSZPJU76","created_at":"2026-06-03T02:06:06.942052+00:00"},{"alias_kind":"pith_short_16","alias_value":"TSOOYSZPJU76MPHX","created_at":"2026-06-03T02:06:06.942052+00:00"},{"alias_kind":"pith_short_8","alias_value":"TSOOYSZP","created_at":"2026-06-03T02:06:06.942052+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/TSOOYSZPJU76MPHXCEA4JLOQU2","json":"https://pith.science/pith/TSOOYSZPJU76MPHXCEA4JLOQU2.json","graph_json":"https://pith.science/api/pith-number/TSOOYSZPJU76MPHXCEA4JLOQU2/graph.json","events_json":"https://pith.science/api/pith-number/TSOOYSZPJU76MPHXCEA4JLOQU2/events.json","paper":"https://pith.science/paper/TSOOYSZP"},"agent_actions":{"view_html":"https://pith.science/pith/TSOOYSZPJU76MPHXCEA4JLOQU2","download_json":"https://pith.science/pith/TSOOYSZPJU76MPHXCEA4JLOQU2.json","view_paper":"https://pith.science/paper/TSOOYSZP","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.03920&json=true","fetch_graph":"https://pith.science/api/pith-number/TSOOYSZPJU76MPHXCEA4JLOQU2/graph.json","fetch_events":"https://pith.science/api/pith-number/TSOOYSZPJU76MPHXCEA4JLOQU2/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/TSOOYSZPJU76MPHXCEA4JLOQU2/action/timestamp_anchor","attest_storage":"https://pith.science/pith/TSOOYSZPJU76MPHXCEA4JLOQU2/action/storage_attestation","attest_author":"https://pith.science/pith/TSOOYSZPJU76MPHXCEA4JLOQU2/action/author_attestation","sign_citation":"https://pith.science/pith/TSOOYSZPJU76MPHXCEA4JLOQU2/action/citation_signature","submit_replication":"https://pith.science/pith/TSOOYSZPJU76MPHXCEA4JLOQU2/action/replication_record"}},"created_at":"2026-06-03T02:06:06.942052+00:00","updated_at":"2026-06-03T02:06:06.942052+00:00"}