{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:D2NKPI6ALH4YL64F6EBITRH6XT","short_pith_number":"pith:D2NKPI6A","canonical_record":{"source":{"id":"2606.07001","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.DB","submitted_at":"2026-06-05T07:44:34Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"6b6a1e195711896a9f66a2aa1e1a49726b0b25459860a76f2ba7d8a73913913c","abstract_canon_sha256":"cb479dbd5880cc38946760c8d0e45aea060c27c1a2f01cfe28accc50ada1a13c"},"schema_version":"1.0"},"canonical_sha256":"1e9aa7a3c059f985fb85f10289c4febcfcb3c0843f5232861cda3af9db0d9493","source":{"kind":"arxiv","id":"2606.07001","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2606.07001","created_at":"2026-06-08T01:04:40Z"},{"alias_kind":"arxiv_version","alias_value":"2606.07001v1","created_at":"2026-06-08T01:04:40Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.07001","created_at":"2026-06-08T01:04:40Z"},{"alias_kind":"pith_short_12","alias_value":"D2NKPI6ALH4Y","created_at":"2026-06-08T01:04:40Z"},{"alias_kind":"pith_short_16","alias_value":"D2NKPI6ALH4YL64F","created_at":"2026-06-08T01:04:40Z"},{"alias_kind":"pith_short_8","alias_value":"D2NKPI6A","created_at":"2026-06-08T01:04:40Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:D2NKPI6ALH4YL64F6EBITRH6XT","target":"record","payload":{"canonical_record":{"source":{"id":"2606.07001","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.DB","submitted_at":"2026-06-05T07:44:34Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"6b6a1e195711896a9f66a2aa1e1a49726b0b25459860a76f2ba7d8a73913913c","abstract_canon_sha256":"cb479dbd5880cc38946760c8d0e45aea060c27c1a2f01cfe28accc50ada1a13c"},"schema_version":"1.0"},"canonical_sha256":"1e9aa7a3c059f985fb85f10289c4febcfcb3c0843f5232861cda3af9db0d9493","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-08T01:04:40.585535Z","signature_b64":"CI6kUsEYVL2oZVhb8qJR2sX/Z8iY2pYroFOvmUDKRu8A0CV0xpVGDMWQDsQzzoB7Ec2C0KD28o/hiUj8s2pMDg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"1e9aa7a3c059f985fb85f10289c4febcfcb3c0843f5232861cda3af9db0d9493","last_reissued_at":"2026-06-08T01:04:40.584716Z","signature_status":"signed_v1","first_computed_at":"2026-06-08T01:04:40.584716Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2606.07001","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-06-08T01:04:40Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"e4XSDhNLSB/FPRtaBUBzXf7scqNmhQGViA0p7kLrybV19Pndy7xfuABgfyfYfrad+OJEmsR2mqJGPyNNbknzCA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-09T09:59:36.064856Z"},"content_sha256":"cc829de3ce3d157ef775884f19e50ed5f563a3c7c7fc38098b073598f89784d6","schema_version":"1.0","event_id":"sha256:cc829de3ce3d157ef775884f19e50ed5f563a3c7c7fc38098b073598f89784d6"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:D2NKPI6ALH4YL64F6EBITRH6XT","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"DataEvolver: Automatic Data Preparation for Large Language Models through Multi-Level Self-Evolving","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.DB","authors_text":"Chao Deng, Ju Fan, Shaolei Zhang, Xiaoyong Du","submitted_at":"2026-06-05T07:44:34Z","abstract_excerpt":"High-quality training data is essential to large language models (LLMs) and typically requires extensive and costly manual curation. Existing automatic data preparation methods rely on predefined pipelines or customized human instructions, which limits their adaptability to diverse data distributions and lacks principled guidance from high-quality examples. In this paper, we introduce DataEvolver, the first self-evolving data preparation system that automatically constructs pipelines to transform raw data into high-quality data. DataEvolver employs a multi-level mechanism to ensure both pipeli"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.07001","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.07001/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-06-08T01:04:40Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"oWwl7YsJEeDt6v8oEMmmfjpQBOrzMluwj/JUs3tW29d2Z/aI5NwcHYcAygPYkQSwdgf5FJj+bl3XGX9BuYczDA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-09T09:59:36.065218Z"},"content_sha256":"56dac7182106dc55f485cfef6b2b2f93c22622da541fe241745352c898d0d581","schema_version":"1.0","event_id":"sha256:56dac7182106dc55f485cfef6b2b2f93c22622da541fe241745352c898d0d581"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/D2NKPI6ALH4YL64F6EBITRH6XT/bundle.json","state_url":"https://pith.science/pith/D2NKPI6ALH4YL64F6EBITRH6XT/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/D2NKPI6ALH4YL64F6EBITRH6XT/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-09T09:59:36Z","links":{"resolver":"https://pith.science/pith/D2NKPI6ALH4YL64F6EBITRH6XT","bundle":"https://pith.science/pith/D2NKPI6ALH4YL64F6EBITRH6XT/bundle.json","state":"https://pith.science/pith/D2NKPI6ALH4YL64F6EBITRH6XT/state.json","well_known_bundle":"https://pith.science/.well-known/pith/D2NKPI6ALH4YL64F6EBITRH6XT/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:D2NKPI6ALH4YL64F6EBITRH6XT","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"cb479dbd5880cc38946760c8d0e45aea060c27c1a2f01cfe28accc50ada1a13c","cross_cats_sorted":["cs.AI"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.DB","submitted_at":"2026-06-05T07:44:34Z","title_canon_sha256":"6b6a1e195711896a9f66a2aa1e1a49726b0b25459860a76f2ba7d8a73913913c"},"schema_version":"1.0","source":{"id":"2606.07001","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2606.07001","created_at":"2026-06-08T01:04:40Z"},{"alias_kind":"arxiv_version","alias_value":"2606.07001v1","created_at":"2026-06-08T01:04:40Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.07001","created_at":"2026-06-08T01:04:40Z"},{"alias_kind":"pith_short_12","alias_value":"D2NKPI6ALH4Y","created_at":"2026-06-08T01:04:40Z"},{"alias_kind":"pith_short_16","alias_value":"D2NKPI6ALH4YL64F","created_at":"2026-06-08T01:04:40Z"},{"alias_kind":"pith_short_8","alias_value":"D2NKPI6A","created_at":"2026-06-08T01:04:40Z"}],"graph_snapshots":[{"event_id":"sha256:56dac7182106dc55f485cfef6b2b2f93c22622da541fe241745352c898d0d581","target":"graph","created_at":"2026-06-08T01:04:40Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2606.07001/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"High-quality training data is essential to large language models (LLMs) and typically requires extensive and costly manual curation. Existing automatic data preparation methods rely on predefined pipelines or customized human instructions, which limits their adaptability to diverse data distributions and lacks principled guidance from high-quality examples. In this paper, we introduce DataEvolver, the first self-evolving data preparation system that automatically constructs pipelines to transform raw data into high-quality data. DataEvolver employs a multi-level mechanism to ensure both pipeli","authors_text":"Chao Deng, Ju Fan, Shaolei Zhang, Xiaoyong Du","cross_cats":["cs.AI"],"headline":"","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.DB","submitted_at":"2026-06-05T07:44:34Z","title":"DataEvolver: Automatic Data Preparation for Large Language Models through Multi-Level Self-Evolving"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.07001","kind":"arxiv","version":1},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:cc829de3ce3d157ef775884f19e50ed5f563a3c7c7fc38098b073598f89784d6","target":"record","created_at":"2026-06-08T01:04:40Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"cb479dbd5880cc38946760c8d0e45aea060c27c1a2f01cfe28accc50ada1a13c","cross_cats_sorted":["cs.AI"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.DB","submitted_at":"2026-06-05T07:44:34Z","title_canon_sha256":"6b6a1e195711896a9f66a2aa1e1a49726b0b25459860a76f2ba7d8a73913913c"},"schema_version":"1.0","source":{"id":"2606.07001","kind":"arxiv","version":1}},"canonical_sha256":"1e9aa7a3c059f985fb85f10289c4febcfcb3c0843f5232861cda3af9db0d9493","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"1e9aa7a3c059f985fb85f10289c4febcfcb3c0843f5232861cda3af9db0d9493","first_computed_at":"2026-06-08T01:04:40.584716Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-06-08T01:04:40.584716Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"CI6kUsEYVL2oZVhb8qJR2sX/Z8iY2pYroFOvmUDKRu8A0CV0xpVGDMWQDsQzzoB7Ec2C0KD28o/hiUj8s2pMDg==","signature_status":"signed_v1","signed_at":"2026-06-08T01:04:40.585535Z","signed_message":"canonical_sha256_bytes"},"source_id":"2606.07001","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:cc829de3ce3d157ef775884f19e50ed5f563a3c7c7fc38098b073598f89784d6","sha256:56dac7182106dc55f485cfef6b2b2f93c22622da541fe241745352c898d0d581"],"state_sha256":"c73f0c1bf4d13072d1bd02920d24ea3a611f0cd65faa15aad45b20b23a3a1bb1"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"umEydHXIgmPGHrekOB+/MsHfL0wlALUJ0z7Y5H5GH9Lb1Cz5MiVOsQ7OA7f7QXVHeqU/zb0JxviITMAogKbIAQ==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-09T09:59:36.067278Z","bundle_sha256":"5c48607396a5246b4c4355995d2e5bad2f693430355b76943570a680ab26a711"}}