{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:X6SQH5VVKARAFFARJOYYKTMMB4","short_pith_number":"pith:X6SQH5VV","schema_version":"1.0","canonical_sha256":"bfa503f6b550220294114bb1854d8c0f29b113f40174ed4eb8bde91362a39616","source":{"kind":"arxiv","id":"2605.15205","version":1},"attestation_state":"computed","paper":{"title":"Does Theory of Mind Improvement Really Benefit Human-AI Interactions? Empirical Findings from Interactive Evaluations","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"Theory of Mind gains on static benchmarks often fail to improve performance in live human-AI interactions.","cross_cats":[],"primary_cat":"cs.AI","authors_text":"Haotian Li, Huamin Qu, Jianxun Lian, Nanxu Gong, Xing Xie, Yanjie Fu, Zishu Zhao, Zixin Chen","submitted_at":"2026-04-28T15:38:31Z","abstract_excerpt":"Improving the Theory of Mind (ToM) capability of Large Language Models (LLMs) is crucial for effective social interactions between these AI models and humans. However, the existing benchmarks often measure ToM capability improvement through story-reading, multiple-choice questions from a third-person perspective, while ignoring the first-person, dynamic, and open-ended nature of human-AI (HAI) interactions. To directly examine how ToM improvement techniques benefit HAI interactions, we first proposed the new paradigm of interactive ToM evaluation with both perspective and metric shifts. Next, "},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":true,"formal_links_present":false},"canonical_record":{"source":{"id":"2605.15205","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2026-04-28T15:38:31Z","cross_cats_sorted":[],"title_canon_sha256":"10c0bf16a93488d4d1f19e9b384f14b04a299a1605b3faead5126e5d91b199bf","abstract_canon_sha256":"c918c98312e1feb1ddd4db279bcc109852cd84ac7c9cdb84df646111ef1a46f8"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T00:00:45.991525Z","signature_b64":"tY47a6fj9QIqIy2dAVf2n29hk9gSAUNH0337mObneuwu/x5r5ijp7GJSqFn1y/J2aWoE91HdyODArI7131pDBQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"bfa503f6b550220294114bb1854d8c0f29b113f40174ed4eb8bde91362a39616","last_reissued_at":"2026-05-20T00:00:45.990531Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T00:00:45.990531Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Does Theory of Mind Improvement Really Benefit Human-AI Interactions? Empirical Findings from Interactive Evaluations","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"Theory of Mind gains on static benchmarks often fail to improve performance in live human-AI interactions.","cross_cats":[],"primary_cat":"cs.AI","authors_text":"Haotian Li, Huamin Qu, Jianxun Lian, Nanxu Gong, Xing Xie, Yanjie Fu, Zishu Zhao, Zixin Chen","submitted_at":"2026-04-28T15:38:31Z","abstract_excerpt":"Improving the Theory of Mind (ToM) capability of Large Language Models (LLMs) is crucial for effective social interactions between these AI models and humans. However, the existing benchmarks often measure ToM capability improvement through story-reading, multiple-choice questions from a third-person perspective, while ignoring the first-person, dynamic, and open-ended nature of human-AI (HAI) interactions. To directly examine how ToM improvement techniques benefit HAI interactions, we first proposed the new paradigm of interactive ToM evaluation with both perspective and metric shifts. Next, "},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Improvements on static benchmarks do not always translate to better performance in dynamic HAI interactions.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The chosen interactive tasks and user-study protocol sufficiently represent the first-person, dynamic, open-ended nature of typical human-AI interactions (section on paradigm shift and evaluation setup).","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Improvements in LLM Theory of Mind on static benchmarks do not reliably improve performance in dynamic, first-person human-AI interactions across goal-oriented and experience-oriented tasks.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Theory of Mind gains on static benchmarks often fail to improve performance in live human-AI interactions.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"8d8108cbf40ae7e77cdb556c7e0adda01fff761a7792bea8639cb74d9fcc3664"},"source":{"id":"2605.15205","kind":"arxiv","version":1},"verdict":{"id":"ec8fb4f1-6e9c-4e3e-a2aa-641d8e66971a","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-19T17:52:03.442586Z","strongest_claim":"Improvements on static benchmarks do not always translate to better performance in dynamic HAI interactions.","one_line_summary":"Improvements in LLM Theory of Mind on static benchmarks do not reliably improve performance in dynamic, first-person human-AI interactions across goal-oriented and experience-oriented tasks.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The chosen interactive tasks and user-study protocol sufficiently represent the first-person, dynamic, open-ended nature of typical human-AI interactions (section on paradigm shift and evaluation setup).","pith_extraction_headline":"Theory of Mind gains on static benchmarks often fail to improve performance in live human-AI interactions."},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.15205/integrity.json","findings":[],"available":true,"detectors_run":[{"name":"doi_compliance","ran_at":"2026-05-19T20:47:13.469270Z","status":"completed","version":"1.0.0","findings_count":0}],"snapshot_sha256":"bf8b0fc324be034be6f29c73db8ca27affddee64625a77236f05f068e8d85876"},"references":{"count":63,"sample":[{"doi":"","year":1972,"title":"Aho and Jeffrey D","work_id":"b1f5cb43-a3c7-4ea0-85e7-9ccc9dfe1588","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":1983,"title":"Publications Manual , year = \"1983\", publisher =","work_id":"aca2b566-99e0-4ebb-9c7a-a81219531259","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"10.1145/322234.322243","year":1981,"title":"Chandra and Dexter C","work_id":"c3270592-bd69-4213-95e1-4aaf8312be9b","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Scalable training of","work_id":"aef70eae-f816-4598-84ec-429a2c09f5fc","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":1997,"title":"Dan Gusfield , title =. 1997","work_id":"852d89f5-1e7b-4296-b4f2-71e578b5e9f6","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":63,"snapshot_sha256":"b6969a244366a60cae26028b061e36fa1f36b86c593c0562cc6bdaa8be1e34d1","internal_anchors":1},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2605.15205","created_at":"2026-05-20T00:00:45.990704+00:00"},{"alias_kind":"arxiv_version","alias_value":"2605.15205v1","created_at":"2026-05-20T00:00:45.990704+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.15205","created_at":"2026-05-20T00:00:45.990704+00:00"},{"alias_kind":"pith_short_12","alias_value":"X6SQH5VVKARA","created_at":"2026-05-20T00:00:45.990704+00:00"},{"alias_kind":"pith_short_16","alias_value":"X6SQH5VVKARAFFAR","created_at":"2026-05-20T00:00:45.990704+00:00"},{"alias_kind":"pith_short_8","alias_value":"X6SQH5VV","created_at":"2026-05-20T00:00:45.990704+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/X6SQH5VVKARAFFARJOYYKTMMB4","json":"https://pith.science/pith/X6SQH5VVKARAFFARJOYYKTMMB4.json","graph_json":"https://pith.science/api/pith-number/X6SQH5VVKARAFFARJOYYKTMMB4/graph.json","events_json":"https://pith.science/api/pith-number/X6SQH5VVKARAFFARJOYYKTMMB4/events.json","paper":"https://pith.science/paper/X6SQH5VV"},"agent_actions":{"view_html":"https://pith.science/pith/X6SQH5VVKARAFFARJOYYKTMMB4","download_json":"https://pith.science/pith/X6SQH5VVKARAFFARJOYYKTMMB4.json","view_paper":"https://pith.science/paper/X6SQH5VV","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2605.15205&json=true","fetch_graph":"https://pith.science/api/pith-number/X6SQH5VVKARAFFARJOYYKTMMB4/graph.json","fetch_events":"https://pith.science/api/pith-number/X6SQH5VVKARAFFARJOYYKTMMB4/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/X6SQH5VVKARAFFARJOYYKTMMB4/action/timestamp_anchor","attest_storage":"https://pith.science/pith/X6SQH5VVKARAFFARJOYYKTMMB4/action/storage_attestation","attest_author":"https://pith.science/pith/X6SQH5VVKARAFFARJOYYKTMMB4/action/author_attestation","sign_citation":"https://pith.science/pith/X6SQH5VVKARAFFARJOYYKTMMB4/action/citation_signature","submit_replication":"https://pith.science/pith/X6SQH5VVKARAFFARJOYYKTMMB4/action/replication_record"}},"created_at":"2026-05-20T00:00:45.990704+00:00","updated_at":"2026-05-20T00:00:45.990704+00:00"}