{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:IEGGIGIZW4TECUAB65OUX6ZS6U","short_pith_number":"pith:IEGGIGIZ","canonical_record":{"source":{"id":"2606.11499","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2026-06-09T22:44:47Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"aa38d645fdd4889e56d5be44d151c0ac5ec4079441e464d8e3704dd16e21e015","abstract_canon_sha256":"08977b1ee80bdf865a55e41af544d6860aa5e0c50f0ae20957cb03985a05acc0"},"schema_version":"1.0"},"canonical_sha256":"410c641919b726415001f75d4bfb32f53f3226d6294ff57fa3365228e560f774","source":{"kind":"arxiv","id":"2606.11499","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2606.11499","created_at":"2026-06-11T01:09:52Z"},{"alias_kind":"arxiv_version","alias_value":"2606.11499v1","created_at":"2026-06-11T01:09:52Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.11499","created_at":"2026-06-11T01:09:52Z"},{"alias_kind":"pith_short_12","alias_value":"IEGGIGIZW4TE","created_at":"2026-06-11T01:09:52Z"},{"alias_kind":"pith_short_16","alias_value":"IEGGIGIZW4TECUAB","created_at":"2026-06-11T01:09:52Z"},{"alias_kind":"pith_short_8","alias_value":"IEGGIGIZ","created_at":"2026-06-11T01:09:52Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:IEGGIGIZW4TECUAB65OUX6ZS6U","target":"record","payload":{"canonical_record":{"source":{"id":"2606.11499","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2026-06-09T22:44:47Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"aa38d645fdd4889e56d5be44d151c0ac5ec4079441e464d8e3704dd16e21e015","abstract_canon_sha256":"08977b1ee80bdf865a55e41af544d6860aa5e0c50f0ae20957cb03985a05acc0"},"schema_version":"1.0"},"canonical_sha256":"410c641919b726415001f75d4bfb32f53f3226d6294ff57fa3365228e560f774","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-11T01:09:52.535049Z","signature_b64":"z4dNH3xgemdcyP1PemrjHtjoHryvVkaQdlR5UwwB83dPmWHyEdLdA0A+5Ze9AffS0J8N+lNeh+cyEfBBnIUYBg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"410c641919b726415001f75d4bfb32f53f3226d6294ff57fa3365228e560f774","last_reissued_at":"2026-06-11T01:09:52.534216Z","signature_status":"signed_v1","first_computed_at":"2026-06-11T01:09:52.534216Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2606.11499","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-06-11T01:09:52Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"x2yPb67c9hctbI3/bNqseRZNbe6G7tL0+m3karisWsY6s5VpeWEgggkH72sJqsCVN7estJSjAkkT4NgTlh5yBQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-30T17:26:52.287173Z"},"content_sha256":"8a7ab79a9fa64e80a9488eada3e53ec2a7b495e560199be2f781707e36d0edc6","schema_version":"1.0","event_id":"sha256:8a7ab79a9fa64e80a9488eada3e53ec2a7b495e560199be2f781707e36d0edc6"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:IEGGIGIZW4TECUAB65OUX6ZS6U","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Hubs or Fringes: Pretraining Data Selection via Web Graph Centrality","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.CL","authors_text":"Danqi Chen, Vedant Badoni, Xinyi Wang","submitted_at":"2026-06-09T22:44:47Z","abstract_excerpt":"The performance of modern language models depends critically on pretraining data composition. Yet existing data selection methods rely on auxiliary classifiers for document scoring or mixture optimization, adding computational overhead and dependence on labeled data. We propose WebGraphMix, a lightweight data selection framework that computes structural centrality scores over the Common Crawl host-level web graph and uses them to vary the proportion of central versus peripheral documents in the pretraining mixture. We hypothesize that central hosts expose models to reusable abstractions, while"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.11499","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.11499/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-06-11T01:09:52Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"FDlMjIwAX0kmJZh6bSc2/5lbMC62QCgR8hgVxxgjUm6u3cFl9wCe7tDj9mr9o7Z1jCtippGki89NUkyHhEQqCg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-30T17:26:52.287564Z"},"content_sha256":"00b1acfc803f138c75e4a21e4f5bb4166230db1be2d28539c60506e67cb4bfd3","schema_version":"1.0","event_id":"sha256:00b1acfc803f138c75e4a21e4f5bb4166230db1be2d28539c60506e67cb4bfd3"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/IEGGIGIZW4TECUAB65OUX6ZS6U/bundle.json","state_url":"https://pith.science/pith/IEGGIGIZW4TECUAB65OUX6ZS6U/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/IEGGIGIZW4TECUAB65OUX6ZS6U/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-30T17:26:52Z","links":{"resolver":"https://pith.science/pith/IEGGIGIZW4TECUAB65OUX6ZS6U","bundle":"https://pith.science/pith/IEGGIGIZW4TECUAB65OUX6ZS6U/bundle.json","state":"https://pith.science/pith/IEGGIGIZW4TECUAB65OUX6ZS6U/state.json","well_known_bundle":"https://pith.science/.well-known/pith/IEGGIGIZW4TECUAB65OUX6ZS6U/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:IEGGIGIZW4TECUAB65OUX6ZS6U","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"08977b1ee80bdf865a55e41af544d6860aa5e0c50f0ae20957cb03985a05acc0","cross_cats_sorted":["cs.AI"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2026-06-09T22:44:47Z","title_canon_sha256":"aa38d645fdd4889e56d5be44d151c0ac5ec4079441e464d8e3704dd16e21e015"},"schema_version":"1.0","source":{"id":"2606.11499","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2606.11499","created_at":"2026-06-11T01:09:52Z"},{"alias_kind":"arxiv_version","alias_value":"2606.11499v1","created_at":"2026-06-11T01:09:52Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.11499","created_at":"2026-06-11T01:09:52Z"},{"alias_kind":"pith_short_12","alias_value":"IEGGIGIZW4TE","created_at":"2026-06-11T01:09:52Z"},{"alias_kind":"pith_short_16","alias_value":"IEGGIGIZW4TECUAB","created_at":"2026-06-11T01:09:52Z"},{"alias_kind":"pith_short_8","alias_value":"IEGGIGIZ","created_at":"2026-06-11T01:09:52Z"}],"graph_snapshots":[{"event_id":"sha256:00b1acfc803f138c75e4a21e4f5bb4166230db1be2d28539c60506e67cb4bfd3","target":"graph","created_at":"2026-06-11T01:09:52Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2606.11499/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"The performance of modern language models depends critically on pretraining data composition. Yet existing data selection methods rely on auxiliary classifiers for document scoring or mixture optimization, adding computational overhead and dependence on labeled data. We propose WebGraphMix, a lightweight data selection framework that computes structural centrality scores over the Common Crawl host-level web graph and uses them to vary the proportion of central versus peripheral documents in the pretraining mixture. We hypothesize that central hosts expose models to reusable abstractions, while","authors_text":"Danqi Chen, Vedant Badoni, Xinyi Wang","cross_cats":["cs.AI"],"headline":"","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2026-06-09T22:44:47Z","title":"Hubs or Fringes: Pretraining Data Selection via Web Graph Centrality"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.11499","kind":"arxiv","version":1},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:8a7ab79a9fa64e80a9488eada3e53ec2a7b495e560199be2f781707e36d0edc6","target":"record","created_at":"2026-06-11T01:09:52Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"08977b1ee80bdf865a55e41af544d6860aa5e0c50f0ae20957cb03985a05acc0","cross_cats_sorted":["cs.AI"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2026-06-09T22:44:47Z","title_canon_sha256":"aa38d645fdd4889e56d5be44d151c0ac5ec4079441e464d8e3704dd16e21e015"},"schema_version":"1.0","source":{"id":"2606.11499","kind":"arxiv","version":1}},"canonical_sha256":"410c641919b726415001f75d4bfb32f53f3226d6294ff57fa3365228e560f774","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"410c641919b726415001f75d4bfb32f53f3226d6294ff57fa3365228e560f774","first_computed_at":"2026-06-11T01:09:52.534216Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-06-11T01:09:52.534216Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"z4dNH3xgemdcyP1PemrjHtjoHryvVkaQdlR5UwwB83dPmWHyEdLdA0A+5Ze9AffS0J8N+lNeh+cyEfBBnIUYBg==","signature_status":"signed_v1","signed_at":"2026-06-11T01:09:52.535049Z","signed_message":"canonical_sha256_bytes"},"source_id":"2606.11499","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:8a7ab79a9fa64e80a9488eada3e53ec2a7b495e560199be2f781707e36d0edc6","sha256:00b1acfc803f138c75e4a21e4f5bb4166230db1be2d28539c60506e67cb4bfd3"],"state_sha256":"1e7dce12a42121a7880ba29f5d604381587c2816cd7f0871751192d6ee1359f6"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"wAO6LrND27aEYDac4AzbrfW3EzokkO7v2dPH+VXoe5uGobvqgLnbg7OJo3p4jUkYd6syErnFkb7Rbxndw9bqCQ==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-30T17:26:52.289605Z","bundle_sha256":"a48987109c17d84a7b6593a05911a30e8e3c7615b1fcdfa43a8a901fa02fc8af"}}