{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:2EUCEFTYFNC2ZF4RKEWOHMVSRW","short_pith_number":"pith:2EUCEFTY","schema_version":"1.0","canonical_sha256":"d1282216782b45ac9791512ce3b2b28d913b37baf452eb5064bdedd721fcc2e2","source":{"kind":"arxiv","id":"2605.21482","version":1},"attestation_state":"computed","paper":{"title":"DeepWeb-Bench: A Deep Research Benchmark Demanding Massive Cross-Source Evidence and Long-Horizon Derivation","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.AI","authors_text":"Baoqing Sun, Chongyang Pan, Haiyang Shen, Jiuzheng Wang, Mugeng Liu, Peilun Jia, Siqi Zhong, Sixiong Xie, Xiang Jing, Yun Ma, Zhuofan Shi","submitted_at":"2026-05-20T17:59:03Z","abstract_excerpt":"Deep research, in which an agent searches the open web, collects evidence, and derives an answer through extended reasoning, is a prominent use case for frontier language models. Frontier deep research products score high on existing benchmarks, making it difficult to distinguish their capabilities from current evaluation data alone. We introduce DeepWeb-Bench, a deep research benchmark that is substantially harder than existing benchmarks for the current frontier. Difficulty comes from three properties of the data itself: each task requires massive evidence collection, cross-source reconcilia"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2605.21482","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2026-05-20T17:59:03Z","cross_cats_sorted":[],"title_canon_sha256":"1e7122a99fd92404679130308878613ffd38ed7a8aed9006049c2119feeccc0a","abstract_canon_sha256":"b9c6addde6c6256fc1439ab99e392899d086f8d40f7feea1880342f4114fc616"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-21T02:05:39.393965Z","signature_b64":"ltB9a7ZVFOhfknk/TX2CRJ78W41k7b/pDT3ZOO5B+DRx6JTW+GCkvfpufcedyJv+XWVCx2gnlt6tacPEAewrAQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"d1282216782b45ac9791512ce3b2b28d913b37baf452eb5064bdedd721fcc2e2","last_reissued_at":"2026-05-21T02:05:39.393125Z","signature_status":"signed_v1","first_computed_at":"2026-05-21T02:05:39.393125Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"DeepWeb-Bench: A Deep Research Benchmark Demanding Massive Cross-Source Evidence and Long-Horizon Derivation","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.AI","authors_text":"Baoqing Sun, Chongyang Pan, Haiyang Shen, Jiuzheng Wang, Mugeng Liu, Peilun Jia, Siqi Zhong, Sixiong Xie, Xiang Jing, Yun Ma, Zhuofan Shi","submitted_at":"2026-05-20T17:59:03Z","abstract_excerpt":"Deep research, in which an agent searches the open web, collects evidence, and derives an answer through extended reasoning, is a prominent use case for frontier language models. Frontier deep research products score high on existing benchmarks, making it difficult to distinguish their capabilities from current evaluation data alone. We introduce DeepWeb-Bench, a deep research benchmark that is substantially harder than existing benchmarks for the current frontier. Difficulty comes from three properties of the data itself: each task requires massive evidence collection, cross-source reconcilia"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.21482","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.21482/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2605.21482","created_at":"2026-05-21T02:05:39.393264+00:00"},{"alias_kind":"arxiv_version","alias_value":"2605.21482v1","created_at":"2026-05-21T02:05:39.393264+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.21482","created_at":"2026-05-21T02:05:39.393264+00:00"},{"alias_kind":"pith_short_12","alias_value":"2EUCEFTYFNC2","created_at":"2026-05-21T02:05:39.393264+00:00"},{"alias_kind":"pith_short_16","alias_value":"2EUCEFTYFNC2ZF4R","created_at":"2026-05-21T02:05:39.393264+00:00"},{"alias_kind":"pith_short_8","alias_value":"2EUCEFTY","created_at":"2026-05-21T02:05:39.393264+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/2EUCEFTYFNC2ZF4RKEWOHMVSRW","json":"https://pith.science/pith/2EUCEFTYFNC2ZF4RKEWOHMVSRW.json","graph_json":"https://pith.science/api/pith-number/2EUCEFTYFNC2ZF4RKEWOHMVSRW/graph.json","events_json":"https://pith.science/api/pith-number/2EUCEFTYFNC2ZF4RKEWOHMVSRW/events.json","paper":"https://pith.science/paper/2EUCEFTY"},"agent_actions":{"view_html":"https://pith.science/pith/2EUCEFTYFNC2ZF4RKEWOHMVSRW","download_json":"https://pith.science/pith/2EUCEFTYFNC2ZF4RKEWOHMVSRW.json","view_paper":"https://pith.science/paper/2EUCEFTY","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2605.21482&json=true","fetch_graph":"https://pith.science/api/pith-number/2EUCEFTYFNC2ZF4RKEWOHMVSRW/graph.json","fetch_events":"https://pith.science/api/pith-number/2EUCEFTYFNC2ZF4RKEWOHMVSRW/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/2EUCEFTYFNC2ZF4RKEWOHMVSRW/action/timestamp_anchor","attest_storage":"https://pith.science/pith/2EUCEFTYFNC2ZF4RKEWOHMVSRW/action/storage_attestation","attest_author":"https://pith.science/pith/2EUCEFTYFNC2ZF4RKEWOHMVSRW/action/author_attestation","sign_citation":"https://pith.science/pith/2EUCEFTYFNC2ZF4RKEWOHMVSRW/action/citation_signature","submit_replication":"https://pith.science/pith/2EUCEFTYFNC2ZF4RKEWOHMVSRW/action/replication_record"}},"created_at":"2026-05-21T02:05:39.393264+00:00","updated_at":"2026-05-21T02:05:39.393264+00:00"}