{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2025:ODHAUUJ4W6KXXLMBS5A5DX6NNE","short_pith_number":"pith:ODHAUUJ4","canonical_record":{"source":{"id":"2502.14739","kind":"arxiv","version":4},"metadata":{"license":"http://creativecommons.org/publicdomain/zero/1.0/","primary_cat":"cs.CL","submitted_at":"2025-02-20T17:05:58Z","cross_cats_sorted":[],"title_canon_sha256":"3d35ab16412f2a2d744cf7d1cdecfbf17234113ad2edc0b1e829f639d42a3ab9","abstract_canon_sha256":"5e4221a4235efe16896596b5e18106ddc45ba53a8f135b2e1478ccf4344aabd6"},"schema_version":"1.0"},"canonical_sha256":"70ce0a513cb7957bad819741d1dfcd6919d880547dbe7becf8ab4e2b15317b7d","source":{"kind":"arxiv","id":"2502.14739","version":4},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2502.14739","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"arxiv_version","alias_value":"2502.14739v4","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2502.14739","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"pith_short_12","alias_value":"ODHAUUJ4W6KX","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"ODHAUUJ4W6KXXLMB","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"ODHAUUJ4","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2025:ODHAUUJ4W6KXXLMBS5A5DX6NNE","target":"record","payload":{"canonical_record":{"source":{"id":"2502.14739","kind":"arxiv","version":4},"metadata":{"license":"http://creativecommons.org/publicdomain/zero/1.0/","primary_cat":"cs.CL","submitted_at":"2025-02-20T17:05:58Z","cross_cats_sorted":[],"title_canon_sha256":"3d35ab16412f2a2d744cf7d1cdecfbf17234113ad2edc0b1e829f639d42a3ab9","abstract_canon_sha256":"5e4221a4235efe16896596b5e18106ddc45ba53a8f135b2e1478ccf4344aabd6"},"schema_version":"1.0"},"canonical_sha256":"70ce0a513cb7957bad819741d1dfcd6919d880547dbe7becf8ab4e2b15317b7d","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:49.563452Z","signature_b64":"uT4vp/Tg9p9YuSVEOptfo3oYZLY+wYyFFtDk8+O5WxN0I8JlcQ3fhAggBu7y3odDYwcZP7ItKEELgBEQfAO+Aw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"70ce0a513cb7957bad819741d1dfcd6919d880547dbe7becf8ab4e2b15317b7d","last_reissued_at":"2026-05-17T23:38:49.562806Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:49.562806Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2502.14739","source_version":4,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:49Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"I7GiAeFq+0xMPb0oZXhGp18osq3kKgu6mERfLKGoP5Xl3untzgpyERZISAUY7ueyOUe+ociS5SvKLylkKVchBQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-10T11:47:33.727245Z"},"content_sha256":"d634f35255dbe1e7d6e46534b132e7c2de49e808c7b64e323980df2d1c4e79b5","schema_version":"1.0","event_id":"sha256:d634f35255dbe1e7d6e46534b132e7c2de49e808c7b64e323980df2d1c4e79b5"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2025:ODHAUUJ4W6KXXLMBS5A5DX6NNE","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"SuperGPQA: Scaling LLM Evaluation across 285 Graduate Disciplines","license":"http://creativecommons.org/publicdomain/zero/1.0/","headline":"SuperGPQA benchmark shows top LLMs reach only 61.82 percent accuracy across 285 graduate disciplines.","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Bingli Wang, Chengdong Lin, Chenghua Zhong, Chenglin Cai, Chengtuo Cheng, Chenqing Wang, Chujie Zheng, Chun Zhang, David Ma, Dayiheng Liu, Ge Zhang, Guoyin Wang, Haoran Que, Hao Wang, Hongquan Lin, Jiaheng Liu, Jiajun Xu, Jian Yang, Jinyang Zhang, Junran Peng, Junting Zhou, Kaijing Ma, Kaixin Deng, Kexin Yang, Keyi Ding, King Zhu, Liang Chen, M-A-P Team, Meng Cao, Minghao Liu, Ming Xu, Min Yang, Qian Liu, Qige Qi, Qinrui Li, Qiyao Wang, Qunshu Lin, Ruibin Yuan, Rui Li, Shanghaoran Quan, Shawn Gavin, Shian Jia, Shi Qiu, Shi Wang, Shiwen Ni, Sichao Jiang, Siming Huang, Sirun Li, Siwei Wu, Tianhao Cheng, Tianhao Liang, Tianyang Pang, Tianyang Zhan, Tianyu Liu, Tianyu Zheng, Tyshawn Hsing, Wangchunshu Zhou, Wenbo Su, Wenhao Huang, Xiang Yue, Xiangyu Zheng, Xiaolong Jin, Xingjian Zhang, Xingwei Qu, Xingyuan Bu, Xinrun Du, Xiyue Zhang, Yang Gao, Yaoru Li, Yifan Chen, Yifan Yao, Yiming Liang, Yinghao Ma, Yiyan Liao, Yiya Wang, Yizhe Li, Yizhi Li, Yizhou Tan, Yongchi Zhao, Yuanhao Yue, Yuansheng Ni, Yubo Wang, Yuelin Bai, Yue Zhang, Yujia Qin, Yun Huang, Yunwen Li, Zekun Moore Wang, Zhaoqun Li, Zhaoxiang Zhang, Zhenlin Wei, Zhenzhu Yang, Zhongyuan Peng, Zhoufutu Wen, Zhoujun Li, Zifan Peng, Zili Wang","submitted_at":"2025-02-20T17:05:58Z","abstract_excerpt":"Large language models (LLMs) have demonstrated remarkable proficiency in mainstream academic disciplines such as mathematics, physics, and computer science. However, human knowledge encompasses over 200 specialized disciplines, far exceeding the scope of existing benchmarks. The capabilities of LLMs in many of these specialized fields-particularly in light industry, agriculture, and service-oriented disciplines-remain inadequately evaluated. To address this gap, we present SuperGPQA, a comprehensive benchmark that evaluates graduate-level knowledge and reasoning capabilities across 285 discipl"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Our experimental results reveal significant room for improvement in the performance of current state-of-the-art LLMs across diverse knowledge domains (e.g., the reasoning-focused model DeepSeek-R1 achieved the highest accuracy of 61.82% on SuperGPQA), highlighting the considerable gap between current model capabilities and artificial general intelligence.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The assumption that the Human-LLM collaborative filtering process produces questions that are genuinely graduate-level, unambiguous, and representative of each discipline without introducing selection bias or over-filtering difficult items.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"SuperGPQA is a new benchmark that tests LLMs on graduate questions from 285 disciplines after human-LLM filtering, with current best models scoring 61.82 percent.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"SuperGPQA benchmark shows top LLMs reach only 61.82 percent accuracy across 285 graduate disciplines.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"e3d3e6348f6b9bf142118ad950ca4109a3f58bf2af5bc648ad85092317d57906"},"source":{"id":"2502.14739","kind":"arxiv","version":4},"verdict":{"id":"75de24d0-ec88-4bf6-8e9e-706c6830f652","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-16T00:41:53.807346Z","strongest_claim":"Our experimental results reveal significant room for improvement in the performance of current state-of-the-art LLMs across diverse knowledge domains (e.g., the reasoning-focused model DeepSeek-R1 achieved the highest accuracy of 61.82% on SuperGPQA), highlighting the considerable gap between current model capabilities and artificial general intelligence.","one_line_summary":"SuperGPQA is a new benchmark that tests LLMs on graduate questions from 285 disciplines after human-LLM filtering, with current best models scoring 61.82 percent.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The assumption that the Human-LLM collaborative filtering process produces questions that are genuinely graduate-level, unambiguous, and representative of each discipline without introducing selection bias or over-filtering difficult items.","pith_extraction_headline":"SuperGPQA benchmark shows top LLMs reach only 61.82 percent accuracy across 285 graduate disciplines."},"references":{"count":121,"sample":[{"doi":"10.48550/arxiv.2412.03205","year":2024,"title":"U-math: A university-level benchmark for evaluating mathematical skills in llms","work_id":"4fe452e9-89bf-48b7-9ab9-743ed3f4e445","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"10.18653/v1/d18-1259","year":2024,"title":"Yi: Open Foundation Models by 01.AI","work_id":"8efee8a1-5e3c-4851-9c65-18e3d1d9e769","ref_index":2,"cited_arxiv_id":"2403.04652","is_internal_anchor":true},{"doi":"","year":null,"title":"According to Danto’s definition, context is an art world with modern aspects","work_id":"ec9df59e-9f2c-47c3-ba2a-70421ba7c367","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"“La Bayadère” is a ballet created during the French July Revolution","work_id":"efed4f45-0889-4bb3-8be6-dda7e0f5a4f9","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"The ballet “Sylvia” is a dance drama created during the Paris Commune period in 1871","work_id":"f96be600-e5fe-42e5-ab21-d6b456ec0256","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":121,"snapshot_sha256":"370c1cd06c40c112bb0dc325c51cae27d5120fdd3b7f7d5d1780f7d900e7fc61","internal_anchors":1},"formal_canon":{"evidence_count":1,"snapshot_sha256":"da884ea85f9410c8b2816ccd15462e36129fc28f8f324243e98ba7c81293b7d9"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"75de24d0-ec88-4bf6-8e9e-706c6830f652"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:49Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"Tt1rieUPjQYUx0PpCO+x5438ZtSjLcSclZmSewhqINiQ7KCBvkYcjVfDDop5EA4XFzsMcM0/8zq/8q4vSG/2AQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-10T11:47:33.728651Z"},"content_sha256":"7eb6d4826d46aecc35d3a4e04281c3188fe98f9e673953fb03bff5a486deb52a","schema_version":"1.0","event_id":"sha256:7eb6d4826d46aecc35d3a4e04281c3188fe98f9e673953fb03bff5a486deb52a"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/ODHAUUJ4W6KXXLMBS5A5DX6NNE/bundle.json","state_url":"https://pith.science/pith/ODHAUUJ4W6KXXLMBS5A5DX6NNE/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/ODHAUUJ4W6KXXLMBS5A5DX6NNE/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-10T11:47:33Z","links":{"resolver":"https://pith.science/pith/ODHAUUJ4W6KXXLMBS5A5DX6NNE","bundle":"https://pith.science/pith/ODHAUUJ4W6KXXLMBS5A5DX6NNE/bundle.json","state":"https://pith.science/pith/ODHAUUJ4W6KXXLMBS5A5DX6NNE/state.json","well_known_bundle":"https://pith.science/.well-known/pith/ODHAUUJ4W6KXXLMBS5A5DX6NNE/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2025:ODHAUUJ4W6KXXLMBS5A5DX6NNE","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"5e4221a4235efe16896596b5e18106ddc45ba53a8f135b2e1478ccf4344aabd6","cross_cats_sorted":[],"license":"http://creativecommons.org/publicdomain/zero/1.0/","primary_cat":"cs.CL","submitted_at":"2025-02-20T17:05:58Z","title_canon_sha256":"3d35ab16412f2a2d744cf7d1cdecfbf17234113ad2edc0b1e829f639d42a3ab9"},"schema_version":"1.0","source":{"id":"2502.14739","kind":"arxiv","version":4}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2502.14739","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"arxiv_version","alias_value":"2502.14739v4","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2502.14739","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"pith_short_12","alias_value":"ODHAUUJ4W6KX","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"ODHAUUJ4W6KXXLMB","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"ODHAUUJ4","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:7eb6d4826d46aecc35d3a4e04281c3188fe98f9e673953fb03bff5a486deb52a","target":"graph","created_at":"2026-05-17T23:38:49Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Our experimental results reveal significant room for improvement in the performance of current state-of-the-art LLMs across diverse knowledge domains (e.g., the reasoning-focused model DeepSeek-R1 achieved the highest accuracy of 61.82% on SuperGPQA), highlighting the considerable gap between current model capabilities and artificial general intelligence."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"The assumption that the Human-LLM collaborative filtering process produces questions that are genuinely graduate-level, unambiguous, and representative of each discipline without introducing selection bias or over-filtering difficult items."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"SuperGPQA is a new benchmark that tests LLMs on graduate questions from 285 disciplines after human-LLM filtering, with current best models scoring 61.82 percent."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"SuperGPQA benchmark shows top LLMs reach only 61.82 percent accuracy across 285 graduate disciplines."}],"snapshot_sha256":"e3d3e6348f6b9bf142118ad950ca4109a3f58bf2af5bc648ad85092317d57906"},"formal_canon":{"evidence_count":1,"snapshot_sha256":"da884ea85f9410c8b2816ccd15462e36129fc28f8f324243e98ba7c81293b7d9"},"paper":{"abstract_excerpt":"Large language models (LLMs) have demonstrated remarkable proficiency in mainstream academic disciplines such as mathematics, physics, and computer science. However, human knowledge encompasses over 200 specialized disciplines, far exceeding the scope of existing benchmarks. The capabilities of LLMs in many of these specialized fields-particularly in light industry, agriculture, and service-oriented disciplines-remain inadequately evaluated. To address this gap, we present SuperGPQA, a comprehensive benchmark that evaluates graduate-level knowledge and reasoning capabilities across 285 discipl","authors_text":"Bingli Wang, Chengdong Lin, Chenghua Zhong, Chenglin Cai, Chengtuo Cheng, Chenqing Wang, Chujie Zheng, Chun Zhang, David Ma, Dayiheng Liu, Ge Zhang, Guoyin Wang, Haoran Que, Hao Wang, Hongquan Lin, Jiaheng Liu, Jiajun Xu, Jian Yang, Jinyang Zhang, Junran Peng, Junting Zhou, Kaijing Ma, Kaixin Deng, Kexin Yang, Keyi Ding, King Zhu, Liang Chen, M-A-P Team, Meng Cao, Minghao Liu, Ming Xu, Min Yang, Qian Liu, Qige Qi, Qinrui Li, Qiyao Wang, Qunshu Lin, Ruibin Yuan, Rui Li, Shanghaoran Quan, Shawn Gavin, Shian Jia, Shi Qiu, Shi Wang, Shiwen Ni, Sichao Jiang, Siming Huang, Sirun Li, Siwei Wu, Tianhao Cheng, Tianhao Liang, Tianyang Pang, Tianyang Zhan, Tianyu Liu, Tianyu Zheng, Tyshawn Hsing, Wangchunshu Zhou, Wenbo Su, Wenhao Huang, Xiang Yue, Xiangyu Zheng, Xiaolong Jin, Xingjian Zhang, Xingwei Qu, Xingyuan Bu, Xinrun Du, Xiyue Zhang, Yang Gao, Yaoru Li, Yifan Chen, Yifan Yao, Yiming Liang, Yinghao Ma, Yiyan Liao, Yiya Wang, Yizhe Li, Yizhi Li, Yizhou Tan, Yongchi Zhao, Yuanhao Yue, Yuansheng Ni, Yubo Wang, Yuelin Bai, Yue Zhang, Yujia Qin, Yun Huang, Yunwen Li, Zekun Moore Wang, Zhaoqun Li, Zhaoxiang Zhang, Zhenlin Wei, Zhenzhu Yang, Zhongyuan Peng, Zhoufutu Wen, Zhoujun Li, Zifan Peng, Zili Wang","cross_cats":[],"headline":"SuperGPQA benchmark shows top LLMs reach only 61.82 percent accuracy across 285 graduate disciplines.","license":"http://creativecommons.org/publicdomain/zero/1.0/","primary_cat":"cs.CL","submitted_at":"2025-02-20T17:05:58Z","title":"SuperGPQA: Scaling LLM Evaluation across 285 Graduate Disciplines"},"references":{"count":121,"internal_anchors":1,"resolved_work":121,"sample":[{"cited_arxiv_id":"","doi":"10.48550/arxiv.2412.03205","is_internal_anchor":false,"ref_index":1,"title":"U-math: A university-level benchmark for evaluating mathematical skills in llms","work_id":"4fe452e9-89bf-48b7-9ab9-743ed3f4e445","year":2024},{"cited_arxiv_id":"2403.04652","doi":"10.18653/v1/d18-1259","is_internal_anchor":true,"ref_index":2,"title":"Yi: Open Foundation Models by 01.AI","work_id":"8efee8a1-5e3c-4851-9c65-18e3d1d9e769","year":2024},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"According to Danto’s definition, context is an art world with modern aspects","work_id":"ec9df59e-9f2c-47c3-ba2a-70421ba7c367","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"“La Bayadère” is a ballet created during the French July Revolution","work_id":"efed4f45-0889-4bb3-8be6-dda7e0f5a4f9","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"The ballet “Sylvia” is a dance drama created during the Paris Commune period in 1871","work_id":"f96be600-e5fe-42e5-ab21-d6b456ec0256","year":null}],"snapshot_sha256":"370c1cd06c40c112bb0dc325c51cae27d5120fdd3b7f7d5d1780f7d900e7fc61"},"source":{"id":"2502.14739","kind":"arxiv","version":4},"verdict":{"created_at":"2026-05-16T00:41:53.807346Z","id":"75de24d0-ec88-4bf6-8e9e-706c6830f652","model_set":{"reader":"grok-4.3"},"one_line_summary":"SuperGPQA is a new benchmark that tests LLMs on graduate questions from 285 disciplines after human-LLM filtering, with current best models scoring 61.82 percent.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"SuperGPQA benchmark shows top LLMs reach only 61.82 percent accuracy across 285 graduate disciplines.","strongest_claim":"Our experimental results reveal significant room for improvement in the performance of current state-of-the-art LLMs across diverse knowledge domains (e.g., the reasoning-focused model DeepSeek-R1 achieved the highest accuracy of 61.82% on SuperGPQA), highlighting the considerable gap between current model capabilities and artificial general intelligence.","weakest_assumption":"The assumption that the Human-LLM collaborative filtering process produces questions that are genuinely graduate-level, unambiguous, and representative of each discipline without introducing selection bias or over-filtering difficult items."}},"verdict_id":"75de24d0-ec88-4bf6-8e9e-706c6830f652"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:d634f35255dbe1e7d6e46534b132e7c2de49e808c7b64e323980df2d1c4e79b5","target":"record","created_at":"2026-05-17T23:38:49Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"5e4221a4235efe16896596b5e18106ddc45ba53a8f135b2e1478ccf4344aabd6","cross_cats_sorted":[],"license":"http://creativecommons.org/publicdomain/zero/1.0/","primary_cat":"cs.CL","submitted_at":"2025-02-20T17:05:58Z","title_canon_sha256":"3d35ab16412f2a2d744cf7d1cdecfbf17234113ad2edc0b1e829f639d42a3ab9"},"schema_version":"1.0","source":{"id":"2502.14739","kind":"arxiv","version":4}},"canonical_sha256":"70ce0a513cb7957bad819741d1dfcd6919d880547dbe7becf8ab4e2b15317b7d","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"70ce0a513cb7957bad819741d1dfcd6919d880547dbe7becf8ab4e2b15317b7d","first_computed_at":"2026-05-17T23:38:49.562806Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:49.562806Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"uT4vp/Tg9p9YuSVEOptfo3oYZLY+wYyFFtDk8+O5WxN0I8JlcQ3fhAggBu7y3odDYwcZP7ItKEELgBEQfAO+Aw==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:49.563452Z","signed_message":"canonical_sha256_bytes"},"source_id":"2502.14739","source_kind":"arxiv","source_version":4}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:d634f35255dbe1e7d6e46534b132e7c2de49e808c7b64e323980df2d1c4e79b5","sha256:7eb6d4826d46aecc35d3a4e04281c3188fe98f9e673953fb03bff5a486deb52a"],"state_sha256":"c6064442745904a439666eec072afd35439b0707712ac1a1b0103bb1956e4df7"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"chrsecp5qItH02uED62gCq3Sq447whboq1LRV24VxLjSlkGSDQc+bUsHpMuOdB+werxBO8pcWUANkUKoc47SCw==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-10T11:47:33.734511Z","bundle_sha256":"ff98eaa42562674476131b5026d61c4fff91c3bbdf610ae209f466680249d14e"}}