{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:HXYMX2CFBZAJXRNSMOT6LZKUF4","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"ac15d4a42f91981b8017d4fa47669631efec53bffaee2c95d6976d7f41569718","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.AI","submitted_at":"2026-06-16T17:22:34Z","title_canon_sha256":"4733da43d086cb7c956ee3e6c0dc69f5a1465f6bb7146af6e3f007628a66bdf3"},"schema_version":"1.0","source":{"id":"2606.18192","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2606.18192","created_at":"2026-06-19T16:12:06Z"},{"alias_kind":"arxiv_version","alias_value":"2606.18192v2","created_at":"2026-06-19T16:12:06Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.18192","created_at":"2026-06-19T16:12:06Z"},{"alias_kind":"pith_short_12","alias_value":"HXYMX2CFBZAJ","created_at":"2026-06-19T16:12:06Z"},{"alias_kind":"pith_short_16","alias_value":"HXYMX2CFBZAJXRNS","created_at":"2026-06-19T16:12:06Z"},{"alias_kind":"pith_short_8","alias_value":"HXYMX2CF","created_at":"2026-06-19T16:12:06Z"}],"graph_snapshots":[{"event_id":"sha256:d3193b55c1deb632d65ebb12a0432653db61a6fbdfdac9a8008f1a50e6417d9c","target":"graph","created_at":"2026-06-19T16:12:06Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2606.18192/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"As high-quality public web corpora become increasingly exhausted, clean long-context documents have become a scarce and expensive source of training data for large language models (LLMs). Existing long-context corpora are often proprietary and costly to acquire, synthetically generated, or concentrated in narrow domains such as programming. We introduce the Stanford EDGAR Filings Dataset (SEFD), an open reconstruction of SEC filings into layout-faithful MultiMarkdown for financial language modeling and evaluation. SEFD makes audited financial statements, risk disclosures, ownership reports, ac","authors_text":"Kay Giesecke, Nick Bettencourt, Xiaowei Ding","cross_cats":[],"headline":"","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.AI","submitted_at":"2026-06-16T17:22:34Z","title":"The Stanford EDGAR Filings Dataset: Reconstructing U.S. Corporate and Financial Disclosures into Layout-Faithful and Token-Efficient Pretraining Data"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.18192","kind":"arxiv","version":2},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:d22a6b33345f23deed458eb6943c67bac8868e33bc73e1e011d8bdab67a286c4","target":"record","created_at":"2026-06-19T16:12:06Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"ac15d4a42f91981b8017d4fa47669631efec53bffaee2c95d6976d7f41569718","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.AI","submitted_at":"2026-06-16T17:22:34Z","title_canon_sha256":"4733da43d086cb7c956ee3e6c0dc69f5a1465f6bb7146af6e3f007628a66bdf3"},"schema_version":"1.0","source":{"id":"2606.18192","kind":"arxiv","version":2}},"canonical_sha256":"3df0cbe8450e409bc5b263a7e5e5542f0060938f801ffbc4b7a3609a5cb876a0","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"3df0cbe8450e409bc5b263a7e5e5542f0060938f801ffbc4b7a3609a5cb876a0","first_computed_at":"2026-06-19T16:12:06.846070Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-06-19T16:12:06.846070Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"KVJfNyFPhfKWtTnta2jyQl1ufghH9tv9CsKbJIt/7iCYVIeTc6QtrSvBonQ9y9uZf05/YM3AqzQjERsDODiTBQ==","signature_status":"signed_v1","signed_at":"2026-06-19T16:12:06.846559Z","signed_message":"canonical_sha256_bytes"},"source_id":"2606.18192","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:d22a6b33345f23deed458eb6943c67bac8868e33bc73e1e011d8bdab67a286c4","sha256:d3193b55c1deb632d65ebb12a0432653db61a6fbdfdac9a8008f1a50e6417d9c"],"state_sha256":"79d68e67847c65b78e86ab200053f224ebaccd4d1163a2d880ab5caaa570940d"}