{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2025:XNC5N7RT5EM2TACSV6F2WZJCEX","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"f0ca5adb20be8b703074165071c0771a274bdeb6faaaa54436042e11cc476101","cross_cats_sorted":["cs.DL"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.IR","submitted_at":"2025-11-14T06:54:48Z","title_canon_sha256":"b9e3ab69407080e883de8c39f3e1f4e7962c186fa6e00e25666ec936589fd6a1"},"schema_version":"1.0","source":{"id":"2511.11010","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2511.11010","created_at":"2026-05-18T03:09:33Z"},{"alias_kind":"arxiv_version","alias_value":"2511.11010v2","created_at":"2026-05-18T03:09:33Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2511.11010","created_at":"2026-05-18T03:09:33Z"},{"alias_kind":"pith_short_12","alias_value":"XNC5N7RT5EM2","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"XNC5N7RT5EM2TACS","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"XNC5N7RT","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:8b1ac59b9fb9f0d7b9aeef6300535a214b898dc31bcab9b3e9c13a8ec53404eb","target":"graph","created_at":"2026-05-18T03:09:33Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"We introduce GovScape, a public search system that supports four primary forms of search over these 10 million PDFs: ... semantic text search and visual search against the PDFs across individual pages... total estimated compute cost for GovScape's pre-processing pipeline for 10 million PDFs was approximately $1,500, equivalent to 47,000 PDF pages per dollar spent on compute."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the chosen embedding models and visual search components produce sufficiently accurate results for the intended use cases without extensive user studies or quantitative evaluation of retrieval quality reported in the abstract."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"GovScape delivers multimodal search over 10 million government PDFs using metadata, exact text, semantic embeddings, and visual page features at an estimated $1,500 preprocessing cost."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"A public system enables semantic and visual searches over 10 million federal government PDFs at roughly $1,500 in preprocessing cost."}],"snapshot_sha256":"536b9828dad4028c48fc3e2984aecbf2ed0e17fb8af523dc5796258438ed0d2d"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"adb91ff8866acad288de80bf6efdf0355ac564697f3ca823b299a8e03c0e0925"},"paper":{"abstract_excerpt":"Efforts over the past three decades have produced web archives containing billions of webpage snapshots and petabytes of data. The End of Term Web Archive alone contains, among other file types, millions of PDFs produced by the federal government. While preservation with web archives has been successful, significant challenges for access and discoverability remain. For example, current affordances for browsing the End of Term PDFs are limited to downloading and browsing individual PDFs, as well as performing basic keyword search across them. In this paper, we introduce GovScape, a public searc","authors_text":"Alison Yan, Benjamin Charles Germain Lee, Claire Gong, Kyle Deeds, Leslie Harka, Mark Phillips, Samuel J Klein, Shannon Zejiang Shen, Shreya Shaji, Trevor Owens, Ying-Hsiang Huang","cross_cats":["cs.DL"],"headline":"A public system enables semantic and visual searches over 10 million federal government PDFs at roughly $1,500 in preprocessing cost.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.IR","submitted_at":"2025-11-14T06:54:48Z","title":"GovScape: A Public Multimodal Search System for 70 Million Pages of Government PDFs"},"references":{"count":42,"internal_anchors":3,"resolved_work":42,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"History in the age of abundance? : how the web is transforming historical research,","work_id":"131bae69-4272-4824-a5a7-70b0ffaf8489","year":2019},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"End of term web archive dataset: Longitudinal web archive of .gov and .mil domains,","work_id":"10544369-01c2-48d0-803a-544449fb55c6","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"‘go fish’: Conceptualising the challenges of engaging national web archives for digital research,","work_id":"bdacfad3-94c3-4ddd-9ad3-4eea0fa41052","year":2021},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Collection search","work_id":"0aea395c-06ed-4a46-abba-402bca46fc31","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Learning transferable visual models from natural language supervi- sion,","work_id":"69dd50bf-2e39-4bc6-ab05-8ef9cce676d6","year":2021}],"snapshot_sha256":"8bae908e5805334e2be407e7f87d55313f2a07d2f808da3b600bb2a73721cea0"},"source":{"id":"2511.11010","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-17T22:50:52.307049Z","id":"f93e65ba-1c99-47f2-a6dd-921f326e67b9","model_set":{"reader":"grok-4.3"},"one_line_summary":"GovScape delivers multimodal search over 10 million government PDFs using metadata, exact text, semantic embeddings, and visual page features at an estimated $1,500 preprocessing cost.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"A public system enables semantic and visual searches over 10 million federal government PDFs at roughly $1,500 in preprocessing cost.","strongest_claim":"We introduce GovScape, a public search system that supports four primary forms of search over these 10 million PDFs: ... semantic text search and visual search against the PDFs across individual pages... total estimated compute cost for GovScape's pre-processing pipeline for 10 million PDFs was approximately $1,500, equivalent to 47,000 PDF pages per dollar spent on compute.","weakest_assumption":"That the chosen embedding models and visual search components produce sufficiently accurate results for the intended use cases without extensive user studies or quantitative evaluation of retrieval quality reported in the abstract."}},"verdict_id":"f93e65ba-1c99-47f2-a6dd-921f326e67b9"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:ba12b578e0262376928cd4496ba307b00f51a44259a34c3b8cb998b64c6bdc0d","target":"record","created_at":"2026-05-18T03:09:33Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"f0ca5adb20be8b703074165071c0771a274bdeb6faaaa54436042e11cc476101","cross_cats_sorted":["cs.DL"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.IR","submitted_at":"2025-11-14T06:54:48Z","title_canon_sha256":"b9e3ab69407080e883de8c39f3e1f4e7962c186fa6e00e25666ec936589fd6a1"},"schema_version":"1.0","source":{"id":"2511.11010","kind":"arxiv","version":2}},"canonical_sha256":"bb45d6fe33e919a98052af8bab652225c57060321173420488435b4b7661c984","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"bb45d6fe33e919a98052af8bab652225c57060321173420488435b4b7661c984","first_computed_at":"2026-05-18T03:09:33.253682Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-18T03:09:33.253682Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"fgNIJ/w509El1r1Smh75TCf9XJ0LIEqTItC/NTJaTysIe+Jp6SbADsFCjNa7MgHMFc4bWBH0OsNkQCPONO7WAw==","signature_status":"signed_v1","signed_at":"2026-05-18T03:09:33.254230Z","signed_message":"canonical_sha256_bytes"},"source_id":"2511.11010","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:ba12b578e0262376928cd4496ba307b00f51a44259a34c3b8cb998b64c6bdc0d","sha256:8b1ac59b9fb9f0d7b9aeef6300535a214b898dc31bcab9b3e9c13a8ec53404eb"],"state_sha256":"c2aa23bab74937df6fb0ed8b3a6d50ece15318edc5f434af2a3ab5ec09c4428c"}