{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:FATVKK4HTZWSNJJ5FZKEMV3YZE","short_pith_number":"pith:FATVKK4H","schema_version":"1.0","canonical_sha256":"2827552b879e6d26a53d2e54465778c939b01dd1c1ad41eb18d3b8e493aca998","source":{"kind":"arxiv","id":"2603.18178","version":2},"attestation_state":"computed","paper":{"title":"VLM-AutoDrive: Post-Training Vision-Language Models for Safety-Critical Autonomous Driving Events","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.CV","authors_text":"Hao Wang, John Kenyon, Kevin Xie, Michael Woods, Ming-Yu Liu, Mohammad Qazim Bhat, Niket Agarwal, Tsung-Yi Lin, Xiaodong Yang, Yufan Huang","submitted_at":"2026-03-18T18:23:34Z","abstract_excerpt":"The rapid growth of ego-centric dashcam footage presents a major challenge for detecting safety-critical events such as collisions and near-collisions, scenarios that are brief, rare, and difficult for generic vision models to capture. While multimodal large language models (MLLMs) demonstrate strong general reasoning ability, they underperform in driving contexts due to domain and temporal misalignment.\n  We introduce VLM-AutoDrive, a modular post-training framework for adapting pretrained Vision-Language Models (VLMs) to high-fidelity anomaly detection. The framework integrates metadata-deri"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2603.18178","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-03-18T18:23:34Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"29031d11ca51954ef367d7a034eb9afcf864f4a67147ff48e1962a56265590dd","abstract_canon_sha256":"692a830b6fdf3d4457473b6e53b400a58ca44c55601a319d8b4551806f38bc62"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T00:02:10.652415Z","signature_b64":"HuuyXsk5ZPkLhIkmPntz3+Bc0fPvTv4ZLXiXZkTWgtNh964TgvQ1J2GEHeKn0GebJVqqP/4viREszD9Ypk0kDQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"2827552b879e6d26a53d2e54465778c939b01dd1c1ad41eb18d3b8e493aca998","last_reissued_at":"2026-05-20T00:02:10.651592Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T00:02:10.651592Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"VLM-AutoDrive: Post-Training Vision-Language Models for Safety-Critical Autonomous Driving Events","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.CV","authors_text":"Hao Wang, John Kenyon, Kevin Xie, Michael Woods, Ming-Yu Liu, Mohammad Qazim Bhat, Niket Agarwal, Tsung-Yi Lin, Xiaodong Yang, Yufan Huang","submitted_at":"2026-03-18T18:23:34Z","abstract_excerpt":"The rapid growth of ego-centric dashcam footage presents a major challenge for detecting safety-critical events such as collisions and near-collisions, scenarios that are brief, rare, and difficult for generic vision models to capture. While multimodal large language models (MLLMs) demonstrate strong general reasoning ability, they underperform in driving contexts due to domain and temporal misalignment.\n  We introduce VLM-AutoDrive, a modular post-training framework for adapting pretrained Vision-Language Models (VLMs) to high-fidelity anomaly detection. The framework integrates metadata-deri"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2603.18178","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2603.18178/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2603.18178","created_at":"2026-05-20T00:02:10.651729+00:00"},{"alias_kind":"arxiv_version","alias_value":"2603.18178v2","created_at":"2026-05-20T00:02:10.651729+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2603.18178","created_at":"2026-05-20T00:02:10.651729+00:00"},{"alias_kind":"pith_short_12","alias_value":"FATVKK4HTZWS","created_at":"2026-05-20T00:02:10.651729+00:00"},{"alias_kind":"pith_short_16","alias_value":"FATVKK4HTZWSNJJ5","created_at":"2026-05-20T00:02:10.651729+00:00"},{"alias_kind":"pith_short_8","alias_value":"FATVKK4H","created_at":"2026-05-20T00:02:10.651729+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":1,"internal_anchor_count":1,"sample":[{"citing_arxiv_id":"2605.21917","citing_title":"MAVEN: A Multi-stage Agentic Annotation Pipeline for Video Reasoning Tasks","ref_index":6,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/FATVKK4HTZWSNJJ5FZKEMV3YZE","json":"https://pith.science/pith/FATVKK4HTZWSNJJ5FZKEMV3YZE.json","graph_json":"https://pith.science/api/pith-number/FATVKK4HTZWSNJJ5FZKEMV3YZE/graph.json","events_json":"https://pith.science/api/pith-number/FATVKK4HTZWSNJJ5FZKEMV3YZE/events.json","paper":"https://pith.science/paper/FATVKK4H"},"agent_actions":{"view_html":"https://pith.science/pith/FATVKK4HTZWSNJJ5FZKEMV3YZE","download_json":"https://pith.science/pith/FATVKK4HTZWSNJJ5FZKEMV3YZE.json","view_paper":"https://pith.science/paper/FATVKK4H","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2603.18178&json=true","fetch_graph":"https://pith.science/api/pith-number/FATVKK4HTZWSNJJ5FZKEMV3YZE/graph.json","fetch_events":"https://pith.science/api/pith-number/FATVKK4HTZWSNJJ5FZKEMV3YZE/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/FATVKK4HTZWSNJJ5FZKEMV3YZE/action/timestamp_anchor","attest_storage":"https://pith.science/pith/FATVKK4HTZWSNJJ5FZKEMV3YZE/action/storage_attestation","attest_author":"https://pith.science/pith/FATVKK4HTZWSNJJ5FZKEMV3YZE/action/author_attestation","sign_citation":"https://pith.science/pith/FATVKK4HTZWSNJJ5FZKEMV3YZE/action/citation_signature","submit_replication":"https://pith.science/pith/FATVKK4HTZWSNJJ5FZKEMV3YZE/action/replication_record"}},"created_at":"2026-05-20T00:02:10.651729+00:00","updated_at":"2026-05-20T00:02:10.651729+00:00"}