{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:GTA3IY6NGW5TLSLLI24ZGTBL7P","short_pith_number":"pith:GTA3IY6N","schema_version":"1.0","canonical_sha256":"34c1b463cd35bb35c96b46b9934c2bfbd74d97ee571176deefb242b32004774d","source":{"kind":"arxiv","id":"2606.08394","version":1},"attestation_state":"computed","paper":{"title":"When Correct Decisions Hide Internal Stress: Decision-State Probing in Multimodal Language Models","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Eduard Hovy, Haoran Zhao, Soyeon Caren Han","submitted_at":"2026-06-07T01:11:36Z","abstract_excerpt":"Multimodal language models are typically evaluated through external behavior: selecting the correct image--text match, rejecting unsupported captions, or answering visual queries correctly. However, correct behavior alone does not show that the model's internal decision state remains stable under controlled semantic stress. We study this gap through S$^3$E (Structured Semantic Stress Evaluation), a framework for analyzing behavior-internal decoupling in multimodal language models. S$^3$E uses a positive-anchored A/B forced-choice setup in which an image-supported caption is contrasted against "},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.08394","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-06-07T01:11:36Z","cross_cats_sorted":[],"title_canon_sha256":"7cdda5abb0ac50a56bf50fb80e380f63349ab186ba186b95e7c5e1944cbb32f4","abstract_canon_sha256":"4226f9db8b8799cea896fabb6e7fa1a3d2d97448e04d397b61f39e8b8edc70ad"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-09T01:05:35.540047Z","signature_b64":"MrAGZCbt24DDQy/turJmiKkt2murR2XlykCKfzLt6VQCpgEa9oyjlQ0fKxnCk27hQEn8cPXiJA1YvoXKTORaDg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"34c1b463cd35bb35c96b46b9934c2bfbd74d97ee571176deefb242b32004774d","last_reissued_at":"2026-06-09T01:05:35.539650Z","signature_status":"signed_v1","first_computed_at":"2026-06-09T01:05:35.539650Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"When Correct Decisions Hide Internal Stress: Decision-State Probing in Multimodal Language Models","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Eduard Hovy, Haoran Zhao, Soyeon Caren Han","submitted_at":"2026-06-07T01:11:36Z","abstract_excerpt":"Multimodal language models are typically evaluated through external behavior: selecting the correct image--text match, rejecting unsupported captions, or answering visual queries correctly. However, correct behavior alone does not show that the model's internal decision state remains stable under controlled semantic stress. We study this gap through S$^3$E (Structured Semantic Stress Evaluation), a framework for analyzing behavior-internal decoupling in multimodal language models. S$^3$E uses a positive-anchored A/B forced-choice setup in which an image-supported caption is contrasted against "},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.08394","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.08394/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.08394","created_at":"2026-06-09T01:05:35.539711+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.08394v1","created_at":"2026-06-09T01:05:35.539711+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.08394","created_at":"2026-06-09T01:05:35.539711+00:00"},{"alias_kind":"pith_short_12","alias_value":"GTA3IY6NGW5T","created_at":"2026-06-09T01:05:35.539711+00:00"},{"alias_kind":"pith_short_16","alias_value":"GTA3IY6NGW5TLSLL","created_at":"2026-06-09T01:05:35.539711+00:00"},{"alias_kind":"pith_short_8","alias_value":"GTA3IY6N","created_at":"2026-06-09T01:05:35.539711+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/GTA3IY6NGW5TLSLLI24ZGTBL7P","json":"https://pith.science/pith/GTA3IY6NGW5TLSLLI24ZGTBL7P.json","graph_json":"https://pith.science/api/pith-number/GTA3IY6NGW5TLSLLI24ZGTBL7P/graph.json","events_json":"https://pith.science/api/pith-number/GTA3IY6NGW5TLSLLI24ZGTBL7P/events.json","paper":"https://pith.science/paper/GTA3IY6N"},"agent_actions":{"view_html":"https://pith.science/pith/GTA3IY6NGW5TLSLLI24ZGTBL7P","download_json":"https://pith.science/pith/GTA3IY6NGW5TLSLLI24ZGTBL7P.json","view_paper":"https://pith.science/paper/GTA3IY6N","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.08394&json=true","fetch_graph":"https://pith.science/api/pith-number/GTA3IY6NGW5TLSLLI24ZGTBL7P/graph.json","fetch_events":"https://pith.science/api/pith-number/GTA3IY6NGW5TLSLLI24ZGTBL7P/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/GTA3IY6NGW5TLSLLI24ZGTBL7P/action/timestamp_anchor","attest_storage":"https://pith.science/pith/GTA3IY6NGW5TLSLLI24ZGTBL7P/action/storage_attestation","attest_author":"https://pith.science/pith/GTA3IY6NGW5TLSLLI24ZGTBL7P/action/author_attestation","sign_citation":"https://pith.science/pith/GTA3IY6NGW5TLSLLI24ZGTBL7P/action/citation_signature","submit_replication":"https://pith.science/pith/GTA3IY6NGW5TLSLLI24ZGTBL7P/action/replication_record"}},"created_at":"2026-06-09T01:05:35.539711+00:00","updated_at":"2026-06-09T01:05:35.539711+00:00"}