{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:WUMKX3E3SHNFAAQH7HL7VVC4CD","short_pith_number":"pith:WUMKX3E3","canonical_record":{"source":{"id":"2605.12987","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2026-05-13T04:36:04Z","cross_cats_sorted":[],"title_canon_sha256":"46a4ca994abd6fde2c1d861045d5119b643ffa29015b1ae649ef3b7c722dbd5a","abstract_canon_sha256":"95c0c5d51065316becd12dcbeeb23978a1645a1979cdaef4829b7cd19aaa2bb8"},"schema_version":"1.0"},"canonical_sha256":"b518abec9b91da500207f9d7fad45c10dd675a2766dd630a7557ff09dcd3f165","source":{"kind":"arxiv","id":"2605.12987","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.12987","created_at":"2026-05-18T03:09:00Z"},{"alias_kind":"arxiv_version","alias_value":"2605.12987v1","created_at":"2026-05-18T03:09:00Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.12987","created_at":"2026-05-18T03:09:00Z"},{"alias_kind":"pith_short_12","alias_value":"WUMKX3E3SHNF","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"WUMKX3E3SHNFAAQH","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"WUMKX3E3","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:WUMKX3E3SHNFAAQH7HL7VVC4CD","target":"record","payload":{"canonical_record":{"source":{"id":"2605.12987","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2026-05-13T04:36:04Z","cross_cats_sorted":[],"title_canon_sha256":"46a4ca994abd6fde2c1d861045d5119b643ffa29015b1ae649ef3b7c722dbd5a","abstract_canon_sha256":"95c0c5d51065316becd12dcbeeb23978a1645a1979cdaef4829b7cd19aaa2bb8"},"schema_version":"1.0"},"canonical_sha256":"b518abec9b91da500207f9d7fad45c10dd675a2766dd630a7557ff09dcd3f165","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T03:09:00.585361Z","signature_b64":"/lSALtTp7vvfcG1bFxrMK2rQkm7vgQHE1j1pwzHq8qtyH0ia61G5eRBX3oFsR4cWoRNiig8JefjeUJLjTJ9mBQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"b518abec9b91da500207f9d7fad45c10dd675a2766dd630a7557ff09dcd3f165","last_reissued_at":"2026-05-18T03:09:00.584598Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T03:09:00.584598Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2605.12987","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T03:09:00Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"8PqQS57/vQJzqpeBhqIkLGnEWhpGFhAaTZfteBKw3uDWswGotpYEO1a7uJUtxG/MoCYQmVoa8H9ZoTZOqqgbCg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-04T13:09:48.179965Z"},"content_sha256":"5bc8ad34b49906d300c310020eb5be48217155983a659117984ef384f68aa233","schema_version":"1.0","event_id":"sha256:5bc8ad34b49906d300c310020eb5be48217155983a659117984ef384f68aa233"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:WUMKX3E3SHNFAAQH7HL7VVC4CD","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Leveraging Multimodal Self-Consistency Reasoning in Coding Motivational Interviewing for Alcohol Use Reduction","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"A multimodal self-consistency method using audio-language models codes motivational interviewing sessions more accurately than single-pass approaches.","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Benjamin O. Ladd, Brian Borsari, Guangzeng Han, James G. Murphy, Xiaolei Huang","submitted_at":"2026-05-13T04:36:04Z","abstract_excerpt":"BACKGROUND: Coding Motivational Interviewing (MI) sessions is essential for understanding client behaviors and predicting outcomes, but it requires substantial time and labor from trained MI professionals. Recent advances in audio-language models (ALMs) offer new opportunities to automate MI coding by capturing multimodal behavioral signals. OBJECTIVE: This study aims to develop an automatic MI coding approach based on ALMs that analyzes raw audio input and integrates predictions from multiple reasoning trajectories using self-consistency to improve coding robustness. METHODS: We experimented "},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"The proposed multimodal self-consistency approach achieved 52.56% accuracy, 54.03% precision, 47.45% recall, and a macro-F1 score of 46.40%, exceeding baseline methods.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the five de-identified MI audio tapes are representative of typical sessions and that majority voting across the twelve trajectories reliably improves accuracy without introducing systematic bias from the chosen prompts or model stochasticity.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Multimodal self-consistency reasoning with audio-language models reaches 52.56% accuracy on coding five MI sessions, outperforming single-pass baselines.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"A multimodal self-consistency method using audio-language models codes motivational interviewing sessions more accurately than single-pass approaches.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"eece4c0590485968ae301b74c02b7e7e8951da0e1e4b31a4980e5712201d94da"},"source":{"id":"2605.12987","kind":"arxiv","version":1},"verdict":{"id":"af20e163-9197-4ffd-ad67-ed723435c0de","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-14T20:10:11.412779Z","strongest_claim":"The proposed multimodal self-consistency approach achieved 52.56% accuracy, 54.03% precision, 47.45% recall, and a macro-F1 score of 46.40%, exceeding baseline methods.","one_line_summary":"Multimodal self-consistency reasoning with audio-language models reaches 52.56% accuracy on coding five MI sessions, outperforming single-pass baselines.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the five de-identified MI audio tapes are representative of typical sessions and that majority voting across the twelve trajectories reliably improves accuracy without introducing systematic bias from the chosen prompts or model stochasticity.","pith_extraction_headline":"A multimodal self-consistency method using audio-language models codes motivational interviewing sessions more accurately than single-pass approaches."},"references":{"count":22,"sample":[{"doi":"","year":2025,"title":"Funding National Science Foundation CNS-2318210 and TI-2434589","work_id":"65ba86a0-5967-4b65-bdda-a98e43efab14","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2025,"title":"BMC psychiatry , volume=","work_id":"4298b5cc-4987-48ad-bb9a-19fde5dd19da","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"M 3 TCM : Multi-modal Multi-task Context Model for Utterance Classification in Motivational Interviews","work_id":"c633a6c7-c977-4690-955a-bce6444d45a5","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Advances in neural information processing systems , volume=","work_id":"1265447d-0324-4d07-abba-34fa29d172da","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"Multimodal audio-language model for speech emotion recognition , author=. 2024 , publisher=","work_id":"6691a798-5e8c-4d76-9442-805805b6de45","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":22,"snapshot_sha256":"07656b4c7fe737a864a7b070aa1eed9536053ee838b16ba73bbec6a33a60a1f6","internal_anchors":1},"formal_canon":{"evidence_count":1,"snapshot_sha256":"06722158045f324bf0d331d8783c7807fd0ef765ba38a40a8719456bb3715725"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"af20e163-9197-4ffd-ad67-ed723435c0de"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T03:09:00Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"Ljren46PD1ujgCuvNvmvaqBb6TxjM2I96kRKzXdedKi9gM6Q2WlP8oa5nKVUZMT9xmrzBBob8Pemdahy4p30Cw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-04T13:09:48.180501Z"},"content_sha256":"fb037d5ecbd1d198b8937d317bbab61e56508887fd0527128c216bd9558eb5e2","schema_version":"1.0","event_id":"sha256:fb037d5ecbd1d198b8937d317bbab61e56508887fd0527128c216bd9558eb5e2"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/WUMKX3E3SHNFAAQH7HL7VVC4CD/bundle.json","state_url":"https://pith.science/pith/WUMKX3E3SHNFAAQH7HL7VVC4CD/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/WUMKX3E3SHNFAAQH7HL7VVC4CD/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-04T13:09:48Z","links":{"resolver":"https://pith.science/pith/WUMKX3E3SHNFAAQH7HL7VVC4CD","bundle":"https://pith.science/pith/WUMKX3E3SHNFAAQH7HL7VVC4CD/bundle.json","state":"https://pith.science/pith/WUMKX3E3SHNFAAQH7HL7VVC4CD/state.json","well_known_bundle":"https://pith.science/.well-known/pith/WUMKX3E3SHNFAAQH7HL7VVC4CD/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:WUMKX3E3SHNFAAQH7HL7VVC4CD","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"95c0c5d51065316becd12dcbeeb23978a1645a1979cdaef4829b7cd19aaa2bb8","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2026-05-13T04:36:04Z","title_canon_sha256":"46a4ca994abd6fde2c1d861045d5119b643ffa29015b1ae649ef3b7c722dbd5a"},"schema_version":"1.0","source":{"id":"2605.12987","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.12987","created_at":"2026-05-18T03:09:00Z"},{"alias_kind":"arxiv_version","alias_value":"2605.12987v1","created_at":"2026-05-18T03:09:00Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.12987","created_at":"2026-05-18T03:09:00Z"},{"alias_kind":"pith_short_12","alias_value":"WUMKX3E3SHNF","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"WUMKX3E3SHNFAAQH","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"WUMKX3E3","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:fb037d5ecbd1d198b8937d317bbab61e56508887fd0527128c216bd9558eb5e2","target":"graph","created_at":"2026-05-18T03:09:00Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"The proposed multimodal self-consistency approach achieved 52.56% accuracy, 54.03% precision, 47.45% recall, and a macro-F1 score of 46.40%, exceeding baseline methods."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the five de-identified MI audio tapes are representative of typical sessions and that majority voting across the twelve trajectories reliably improves accuracy without introducing systematic bias from the chosen prompts or model stochasticity."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Multimodal self-consistency reasoning with audio-language models reaches 52.56% accuracy on coding five MI sessions, outperforming single-pass baselines."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"A multimodal self-consistency method using audio-language models codes motivational interviewing sessions more accurately than single-pass approaches."}],"snapshot_sha256":"eece4c0590485968ae301b74c02b7e7e8951da0e1e4b31a4980e5712201d94da"},"formal_canon":{"evidence_count":1,"snapshot_sha256":"06722158045f324bf0d331d8783c7807fd0ef765ba38a40a8719456bb3715725"},"paper":{"abstract_excerpt":"BACKGROUND: Coding Motivational Interviewing (MI) sessions is essential for understanding client behaviors and predicting outcomes, but it requires substantial time and labor from trained MI professionals. Recent advances in audio-language models (ALMs) offer new opportunities to automate MI coding by capturing multimodal behavioral signals. OBJECTIVE: This study aims to develop an automatic MI coding approach based on ALMs that analyzes raw audio input and integrates predictions from multiple reasoning trajectories using self-consistency to improve coding robustness. METHODS: We experimented ","authors_text":"Benjamin O. Ladd, Brian Borsari, Guangzeng Han, James G. Murphy, Xiaolei Huang","cross_cats":[],"headline":"A multimodal self-consistency method using audio-language models codes motivational interviewing sessions more accurately than single-pass approaches.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2026-05-13T04:36:04Z","title":"Leveraging Multimodal Self-Consistency Reasoning in Coding Motivational Interviewing for Alcohol Use Reduction"},"references":{"count":22,"internal_anchors":1,"resolved_work":22,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Funding National Science Foundation CNS-2318210 and TI-2434589","work_id":"65ba86a0-5967-4b65-bdda-a98e43efab14","year":2025},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"BMC psychiatry , volume=","work_id":"4298b5cc-4987-48ad-bb9a-19fde5dd19da","year":2025},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"M 3 TCM : Multi-modal Multi-task Context Model for Utterance Classification in Motivational Interviews","work_id":"c633a6c7-c977-4690-955a-bce6444d45a5","year":2024},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Advances in neural information processing systems , volume=","work_id":"1265447d-0324-4d07-abba-34fa29d172da","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Multimodal audio-language model for speech emotion recognition , author=. 2024 , publisher=","work_id":"6691a798-5e8c-4d76-9442-805805b6de45","year":2024}],"snapshot_sha256":"07656b4c7fe737a864a7b070aa1eed9536053ee838b16ba73bbec6a33a60a1f6"},"source":{"id":"2605.12987","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-14T20:10:11.412779Z","id":"af20e163-9197-4ffd-ad67-ed723435c0de","model_set":{"reader":"grok-4.3"},"one_line_summary":"Multimodal self-consistency reasoning with audio-language models reaches 52.56% accuracy on coding five MI sessions, outperforming single-pass baselines.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"A multimodal self-consistency method using audio-language models codes motivational interviewing sessions more accurately than single-pass approaches.","strongest_claim":"The proposed multimodal self-consistency approach achieved 52.56% accuracy, 54.03% precision, 47.45% recall, and a macro-F1 score of 46.40%, exceeding baseline methods.","weakest_assumption":"That the five de-identified MI audio tapes are representative of typical sessions and that majority voting across the twelve trajectories reliably improves accuracy without introducing systematic bias from the chosen prompts or model stochasticity."}},"verdict_id":"af20e163-9197-4ffd-ad67-ed723435c0de"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:5bc8ad34b49906d300c310020eb5be48217155983a659117984ef384f68aa233","target":"record","created_at":"2026-05-18T03:09:00Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"95c0c5d51065316becd12dcbeeb23978a1645a1979cdaef4829b7cd19aaa2bb8","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2026-05-13T04:36:04Z","title_canon_sha256":"46a4ca994abd6fde2c1d861045d5119b643ffa29015b1ae649ef3b7c722dbd5a"},"schema_version":"1.0","source":{"id":"2605.12987","kind":"arxiv","version":1}},"canonical_sha256":"b518abec9b91da500207f9d7fad45c10dd675a2766dd630a7557ff09dcd3f165","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"b518abec9b91da500207f9d7fad45c10dd675a2766dd630a7557ff09dcd3f165","first_computed_at":"2026-05-18T03:09:00.584598Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-18T03:09:00.584598Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"/lSALtTp7vvfcG1bFxrMK2rQkm7vgQHE1j1pwzHq8qtyH0ia61G5eRBX3oFsR4cWoRNiig8JefjeUJLjTJ9mBQ==","signature_status":"signed_v1","signed_at":"2026-05-18T03:09:00.585361Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.12987","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:5bc8ad34b49906d300c310020eb5be48217155983a659117984ef384f68aa233","sha256:fb037d5ecbd1d198b8937d317bbab61e56508887fd0527128c216bd9558eb5e2"],"state_sha256":"54e988a10fdbb61e330cd981c2b80241272916128d79674cf3435cd706b1952c"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"/p9Bhr/9H35+UBWgmHxUaen2UG0/35I7+VkEWGfLMWxsgd3O3xHh8sc9SIQ//NM6U52Rc6ID5Chca3VtitfDDQ==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-04T13:09:48.182815Z","bundle_sha256":"6bc480b0daf2803ffc54daf1f37948a8f2887afb9b320511e857b60f037e8ed2"}}