{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2023:FMJQSJSFFAOLGAFINRTNTVFPHH","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"a2a18e59511c993ddda9d3136987bf9209d89feec0b355f8f79871b9e7e58bd3","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CL","submitted_at":"2023-02-08T12:35:34Z","title_canon_sha256":"0af1cd3d0f93626347676b13538faee2652ea3b76a43db3c8090a2df56595df6"},"schema_version":"1.0","source":{"id":"2302.04023","kind":"arxiv","version":4}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2302.04023","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"arxiv_version","alias_value":"2302.04023v4","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2302.04023","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"pith_short_12","alias_value":"FMJQSJSFFAOL","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_16","alias_value":"FMJQSJSFFAOLGAFI","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_8","alias_value":"FMJQSJSF","created_at":"2026-05-18T12:33:33Z"}],"graph_snapshots":[{"event_id":"sha256:a7d58eea8118774770134522159c4b9daa12f9542f1dcbca7acf7245a0b21399","target":"graph","created_at":"2026-05-17T23:38:13Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"ChatGPT is 63.41% accurate on average in 10 different reasoning categories under logical reasoning, non-textual reasoning, and commonsense reasoning, hence making it an unreliable reasoner. It is, for example, better at deductive than inductive reasoning."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the 23 chosen datasets, the newly designed multimodal dataset, and the 10 reasoning categories provide a representative and low-bias measure of ChatGPT capabilities without major sensitivity to prompt wording or subjective hallucination labeling."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"ChatGPT outperforms zero-shot LLMs on most tasks and improves with interaction but scores only 63.41 percent on reasoning categories and generates extrinsic hallucinations from its training data."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"ChatGPT averages 63.41% accuracy across ten reasoning categories and improves only modestly with human interaction."}],"snapshot_sha256":"8a814fab06f443108a773eff91b830adb110c6c355702f7556083a8b920e9519"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"883f3c61477ebc15cbf623842d0c9eb96fe1dfb7165d54a3f068e40de2bb238b"},"paper":{"abstract_excerpt":"This paper proposes a framework for quantitatively evaluating interactive LLMs such as ChatGPT using publicly available data sets. We carry out an extensive technical evaluation of ChatGPT using 23 data sets covering 8 different common NLP application tasks. We evaluate the multitask, multilingual and multi-modal aspects of ChatGPT based on these data sets and a newly designed multimodal dataset. We find that ChatGPT outperforms LLMs with zero-shot learning on most tasks and even outperforms fine-tuned models on some tasks. We find that it is better at understanding non-Latin script languages ","authors_text":"Bryan Wilie, Dan Su, Holy Lovenia, Nayeon Lee, Pascale Fung, Quyet V. Do, Samuel Cahyawijaya, Tiezheng Yu, Wenliang Dai, Willy Chung, Yan Xu, Yejin Bang, Ziwei Ji","cross_cats":["cs.AI"],"headline":"ChatGPT averages 63.41% accuracy across ten reasoning categories and improves only modestly with human interaction.","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CL","submitted_at":"2023-02-08T12:35:34Z","title":"A Multitask, Multilingual, Multimodal Evaluation of ChatGPT on Reasoning, Hallucination, and Interactivity"},"references":{"count":23,"internal_anchors":2,"resolved_work":23,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"News summarization and evaluation in the era of gpt-3","work_id":"ec749274-c88c-461b-8bea-cc527ad5c047","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"In Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing, pages 7890–7900","work_id":"c87a4363-a07c-404a-ac90-4bc9bab7e033","year":2021},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Qa dataset explosion: A taxonomy of nlp resources for question answering and reading com- prehension. ACM Comput. Surv. Just Accepted. Robin Rombach, Andreas Blattmann, Dominik Lorenz, Patrick Esser, ","work_id":"34ef0ae1-f5d9-4272-93e7-9deb55e86050","year":2022},{"cited_arxiv_id":"2206.04615","doi":"","is_internal_anchor":true,"ref_index":4,"title":"Beyond the Imitation Game: Quantifying and extrapolating the capabilities of language models","work_id":"bb63abb3-0d50-4362-b97c-b5e725b03b39","year":2022},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Richmond Thomason","work_id":"0d93f796-a6ff-4975-9a9c-56e6eb348e66","year":2018}],"snapshot_sha256":"ffbaf2a528ad96be45f30d139f4641b9f5a1f8d4f1fdc12b4bc353ff7f971730"},"source":{"id":"2302.04023","kind":"arxiv","version":4},"verdict":{"created_at":"2026-05-17T19:53:52.885799Z","id":"57f14e71-d1ef-47f9-a0e5-6d9394914ccb","model_set":{"reader":"grok-4.3"},"one_line_summary":"ChatGPT outperforms zero-shot LLMs on most tasks and improves with interaction but scores only 63.41 percent on reasoning categories and generates extrinsic hallucinations from its training data.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"ChatGPT averages 63.41% accuracy across ten reasoning categories and improves only modestly with human interaction.","strongest_claim":"ChatGPT is 63.41% accurate on average in 10 different reasoning categories under logical reasoning, non-textual reasoning, and commonsense reasoning, hence making it an unreliable reasoner. It is, for example, better at deductive than inductive reasoning.","weakest_assumption":"That the 23 chosen datasets, the newly designed multimodal dataset, and the 10 reasoning categories provide a representative and low-bias measure of ChatGPT capabilities without major sensitivity to prompt wording or subjective hallucination labeling."}},"verdict_id":"57f14e71-d1ef-47f9-a0e5-6d9394914ccb"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:3c05e2edff9433e741e39db78d4c89aa0fefcffef6e007006a76c321d61ec718","target":"record","created_at":"2026-05-17T23:38:13Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"a2a18e59511c993ddda9d3136987bf9209d89feec0b355f8f79871b9e7e58bd3","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CL","submitted_at":"2023-02-08T12:35:34Z","title_canon_sha256":"0af1cd3d0f93626347676b13538faee2652ea3b76a43db3c8090a2df56595df6"},"schema_version":"1.0","source":{"id":"2302.04023","kind":"arxiv","version":4}},"canonical_sha256":"2b13092645281cb300a86c66d9d4af39df07b806108f271b30a4904d70721687","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"2b13092645281cb300a86c66d9d4af39df07b806108f271b30a4904d70721687","first_computed_at":"2026-05-17T23:38:13.237057Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:13.237057Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"IThoSBAEf0JFMRGF+x7vrCK5spKbYgezHYGh9a9BRPafXpogQJvnAAXvgPpLOJoTQaQMJUf8aLVa7wbMbhbyCw==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:13.237681Z","signed_message":"canonical_sha256_bytes"},"source_id":"2302.04023","source_kind":"arxiv","source_version":4}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:3c05e2edff9433e741e39db78d4c89aa0fefcffef6e007006a76c321d61ec718","sha256:a7d58eea8118774770134522159c4b9daa12f9542f1dcbca7acf7245a0b21399"],"state_sha256":"c1dbe9c41cc760400011a2a696c37c4d55fe4bba918963d30995079b4a73935a"}