{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:4FJ3YBJJDNDFGB45VKWDT23VRG","short_pith_number":"pith:4FJ3YBJJ","schema_version":"1.0","canonical_sha256":"e153bc05291b4653079daaac39eb75899bcd53acda9c7b02fb8d934ec98e4832","source":{"kind":"arxiv","id":"2604.06750","version":2},"attestation_state":"computed","paper":{"title":"How Well Do Vision-Language Models Understand Sequential Driving Scenes? A Sensitivity Study","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"Vision-language models reach only 57% accuracy on sequential driving scenes and fall short of human performance.","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Johannes Betz, Mattia Piccinini, Roberto Brusnicki","submitted_at":"2026-04-08T07:14:55Z","abstract_excerpt":"Vision-Language Models (VLMs) are increasingly proposed for autonomous driving tasks, yet their performance on sequential driving scenes remains poorly characterized, particularly regarding how input configurations affect their capabilities. We introduce VENUSS (VLM Evaluation oN Understanding Sequential Scenes), a framework for systematic sensitivity analysis of VLM performance on sequential driving scenes, establishing baselines for future research. Building upon existing datasets, VENUSS extracts temporal sequences from driving videos, and generates structured evaluations across custom cate"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2604.06750","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-04-08T07:14:55Z","cross_cats_sorted":[],"title_canon_sha256":"67f334d6da57a1375949c60affdc945fb1c3272c0f806c570259df75d3a88255","abstract_canon_sha256":"eaae9699e0bf05d1707f27b6b80d1def58d430e7ddedfb235e2797a3263dc91c"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-21T01:05:18.645580Z","signature_b64":"Diu39KWJJkMAaMoVOVWxD6Gyaemnj8jGaOi0dbLyeEXxy9dEIqgMaaIw8M491WPG7buOxLLMtVLPBe5mIBLjDQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"e153bc05291b4653079daaac39eb75899bcd53acda9c7b02fb8d934ec98e4832","last_reissued_at":"2026-05-21T01:05:18.644949Z","signature_status":"signed_v1","first_computed_at":"2026-05-21T01:05:18.644949Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"How Well Do Vision-Language Models Understand Sequential Driving Scenes? A Sensitivity Study","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"Vision-language models reach only 57% accuracy on sequential driving scenes and fall short of human performance.","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Johannes Betz, Mattia Piccinini, Roberto Brusnicki","submitted_at":"2026-04-08T07:14:55Z","abstract_excerpt":"Vision-Language Models (VLMs) are increasingly proposed for autonomous driving tasks, yet their performance on sequential driving scenes remains poorly characterized, particularly regarding how input configurations affect their capabilities. We introduce VENUSS (VLM Evaluation oN Understanding Sequential Scenes), a framework for systematic sensitivity analysis of VLM performance on sequential driving scenes, establishing baselines for future research. Building upon existing datasets, VENUSS extracts temporal sequences from driving videos, and generates structured evaluations across custom cate"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"even top models achieve only 57% accuracy, not matching human performance in similar constraints (65%) and exposing significant capability gaps. Our analysis shows that VLMs excel with static object detection but struggle with understanding the vehicle dynamics and temporal relations.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the custom-generated questions and extracted sequences from existing driving videos provide an unbiased and representative test of sequential understanding without introducing artifacts from the extraction or question-generation process.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"VENUSS benchmark shows top VLMs achieve 57% accuracy on sequential driving scenes, strong on static objects but weak on vehicle dynamics and temporal relations.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Vision-language models reach only 57% accuracy on sequential driving scenes and fall short of human performance.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"b9cd5434acb5ddc413fe16c9c91959a1fdf5ae80aadc9527b7478867c5c82c53"},"source":{"id":"2604.06750","kind":"arxiv","version":2},"verdict":{"id":"ec25e882-4fb2-45aa-9b80-f2a27ca0f8e7","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-10T18:41:46.404826Z","strongest_claim":"even top models achieve only 57% accuracy, not matching human performance in similar constraints (65%) and exposing significant capability gaps. Our analysis shows that VLMs excel with static object detection but struggle with understanding the vehicle dynamics and temporal relations.","one_line_summary":"VENUSS benchmark shows top VLMs achieve 57% accuracy on sequential driving scenes, strong on static objects but weak on vehicle dynamics and temporal relations.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the custom-generated questions and extracted sequences from existing driving videos provide an unbiased and representative test of sequential understanding without introducing artifacts from the extraction or question-generation process.","pith_extraction_headline":"Vision-language models reach only 57% accuracy on sequential driving scenes and fall short of human performance."},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2604.06750/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2604.06750","created_at":"2026-05-21T01:05:18.645044+00:00"},{"alias_kind":"arxiv_version","alias_value":"2604.06750v2","created_at":"2026-05-21T01:05:18.645044+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2604.06750","created_at":"2026-05-21T01:05:18.645044+00:00"},{"alias_kind":"pith_short_12","alias_value":"4FJ3YBJJDNDF","created_at":"2026-05-21T01:05:18.645044+00:00"},{"alias_kind":"pith_short_16","alias_value":"4FJ3YBJJDNDFGB45","created_at":"2026-05-21T01:05:18.645044+00:00"},{"alias_kind":"pith_short_8","alias_value":"4FJ3YBJJ","created_at":"2026-05-21T01:05:18.645044+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/4FJ3YBJJDNDFGB45VKWDT23VRG","json":"https://pith.science/pith/4FJ3YBJJDNDFGB45VKWDT23VRG.json","graph_json":"https://pith.science/api/pith-number/4FJ3YBJJDNDFGB45VKWDT23VRG/graph.json","events_json":"https://pith.science/api/pith-number/4FJ3YBJJDNDFGB45VKWDT23VRG/events.json","paper":"https://pith.science/paper/4FJ3YBJJ"},"agent_actions":{"view_html":"https://pith.science/pith/4FJ3YBJJDNDFGB45VKWDT23VRG","download_json":"https://pith.science/pith/4FJ3YBJJDNDFGB45VKWDT23VRG.json","view_paper":"https://pith.science/paper/4FJ3YBJJ","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2604.06750&json=true","fetch_graph":"https://pith.science/api/pith-number/4FJ3YBJJDNDFGB45VKWDT23VRG/graph.json","fetch_events":"https://pith.science/api/pith-number/4FJ3YBJJDNDFGB45VKWDT23VRG/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/4FJ3YBJJDNDFGB45VKWDT23VRG/action/timestamp_anchor","attest_storage":"https://pith.science/pith/4FJ3YBJJDNDFGB45VKWDT23VRG/action/storage_attestation","attest_author":"https://pith.science/pith/4FJ3YBJJDNDFGB45VKWDT23VRG/action/author_attestation","sign_citation":"https://pith.science/pith/4FJ3YBJJDNDFGB45VKWDT23VRG/action/citation_signature","submit_replication":"https://pith.science/pith/4FJ3YBJJDNDFGB45VKWDT23VRG/action/replication_record"}},"created_at":"2026-05-21T01:05:18.645044+00:00","updated_at":"2026-05-21T01:05:18.645044+00:00"}