{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:OFKRBFYV3HDMS5LLCYR6VZTITW","short_pith_number":"pith:OFKRBFYV","schema_version":"1.0","canonical_sha256":"7155109715d9c6c9756b1623eae6689dab4bd68961e6aab717bafafbd12bd395","source":{"kind":"arxiv","id":"2503.11832","version":5},"attestation_state":"computed","paper":{"title":"Safety Mirage: How Spurious Correlations Undermine VLM Safety Fine-Tuning and Can Be Mitigated by Machine Unlearning","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.LG"],"primary_cat":"cs.AI","authors_text":"Bingquan Shen, Gaowen Liu, Sijia Liu, Yihua Zhang, Yiwei Chen, Yuguang Yao","submitted_at":"2025-03-14T19:52:08Z","abstract_excerpt":"Recent vision language models (VLMs) have made remarkable strides in generative modeling with multimodal inputs, particularly text and images. However, their susceptibility to generating harmful content when exposed to unsafe queries raises critical safety concerns. While current alignment strategies primarily rely on supervised safety fine-tuning with curated datasets, we identify a fundamental limitation we call the ''safety mirage'', where supervised fine-tuning inadvertently reinforces spurious correlations between superficial textual patterns and safety responses, rather than fostering de"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2503.11832","kind":"arxiv","version":5},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2025-03-14T19:52:08Z","cross_cats_sorted":["cs.LG"],"title_canon_sha256":"9bf2d73b6469845f0b74ee23d603c55c3b9a55dfa83e62cb8cf9c8b7db3475b1","abstract_canon_sha256":"82e08a533598876f2d2c784265594c656cb1b21d4c47a0992f5965628cc40dbc"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-02T01:03:30.834149Z","signature_b64":"V7W3kaeEgMI2kV9yhss4WWxi7gcz26C+8YwMaMyDwz+SSl7Lq8vMC7usWKdKgw2DLIlC+ZXGSwsXPPRyU03oDQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"7155109715d9c6c9756b1623eae6689dab4bd68961e6aab717bafafbd12bd395","last_reissued_at":"2026-06-02T01:03:30.833592Z","signature_status":"signed_v1","first_computed_at":"2026-06-02T01:03:30.833592Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Safety Mirage: How Spurious Correlations Undermine VLM Safety Fine-Tuning and Can Be Mitigated by Machine Unlearning","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.LG"],"primary_cat":"cs.AI","authors_text":"Bingquan Shen, Gaowen Liu, Sijia Liu, Yihua Zhang, Yiwei Chen, Yuguang Yao","submitted_at":"2025-03-14T19:52:08Z","abstract_excerpt":"Recent vision language models (VLMs) have made remarkable strides in generative modeling with multimodal inputs, particularly text and images. However, their susceptibility to generating harmful content when exposed to unsafe queries raises critical safety concerns. While current alignment strategies primarily rely on supervised safety fine-tuning with curated datasets, we identify a fundamental limitation we call the ''safety mirage'', where supervised fine-tuning inadvertently reinforces spurious correlations between superficial textual patterns and safety responses, rather than fostering de"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2503.11832","kind":"arxiv","version":5},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2503.11832/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2503.11832","created_at":"2026-06-02T01:03:30.833663+00:00"},{"alias_kind":"arxiv_version","alias_value":"2503.11832v5","created_at":"2026-06-02T01:03:30.833663+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2503.11832","created_at":"2026-06-02T01:03:30.833663+00:00"},{"alias_kind":"pith_short_12","alias_value":"OFKRBFYV3HDM","created_at":"2026-06-02T01:03:30.833663+00:00"},{"alias_kind":"pith_short_16","alias_value":"OFKRBFYV3HDMS5LL","created_at":"2026-06-02T01:03:30.833663+00:00"},{"alias_kind":"pith_short_8","alias_value":"OFKRBFYV","created_at":"2026-06-02T01:03:30.833663+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/OFKRBFYV3HDMS5LLCYR6VZTITW","json":"https://pith.science/pith/OFKRBFYV3HDMS5LLCYR6VZTITW.json","graph_json":"https://pith.science/api/pith-number/OFKRBFYV3HDMS5LLCYR6VZTITW/graph.json","events_json":"https://pith.science/api/pith-number/OFKRBFYV3HDMS5LLCYR6VZTITW/events.json","paper":"https://pith.science/paper/OFKRBFYV"},"agent_actions":{"view_html":"https://pith.science/pith/OFKRBFYV3HDMS5LLCYR6VZTITW","download_json":"https://pith.science/pith/OFKRBFYV3HDMS5LLCYR6VZTITW.json","view_paper":"https://pith.science/paper/OFKRBFYV","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2503.11832&json=true","fetch_graph":"https://pith.science/api/pith-number/OFKRBFYV3HDMS5LLCYR6VZTITW/graph.json","fetch_events":"https://pith.science/api/pith-number/OFKRBFYV3HDMS5LLCYR6VZTITW/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/OFKRBFYV3HDMS5LLCYR6VZTITW/action/timestamp_anchor","attest_storage":"https://pith.science/pith/OFKRBFYV3HDMS5LLCYR6VZTITW/action/storage_attestation","attest_author":"https://pith.science/pith/OFKRBFYV3HDMS5LLCYR6VZTITW/action/author_attestation","sign_citation":"https://pith.science/pith/OFKRBFYV3HDMS5LLCYR6VZTITW/action/citation_signature","submit_replication":"https://pith.science/pith/OFKRBFYV3HDMS5LLCYR6VZTITW/action/replication_record"}},"created_at":"2026-06-02T01:03:30.833663+00:00","updated_at":"2026-06-02T01:03:30.833663+00:00"}