{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2025:7F3HT4XHWON3TP5E7FPLKHV3OV","short_pith_number":"pith:7F3HT4XH","canonical_record":{"source":{"id":"2504.13161","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2025-04-17T17:58:13Z","cross_cats_sorted":[],"title_canon_sha256":"ea3e64743b38ca5e47e86c70d06b4182138b1965508bef33424aba9ba4106691","abstract_canon_sha256":"beeeb5700d355783df460d2f39a7b42479fc06e472e98e25be418b70f9bf51ec"},"schema_version":"1.0"},"canonical_sha256":"f97679f2e7b39bb9bfa4f95eb51ebb754da148564a5076fb37f1d01fbf0faf9a","source":{"kind":"arxiv","id":"2504.13161","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2504.13161","created_at":"2026-06-23T20:14:13Z"},{"alias_kind":"arxiv_version","alias_value":"2504.13161v2","created_at":"2026-06-23T20:14:13Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2504.13161","created_at":"2026-06-23T20:14:13Z"},{"alias_kind":"pith_short_12","alias_value":"7F3HT4XHWON3","created_at":"2026-06-23T20:14:13Z"},{"alias_kind":"pith_short_16","alias_value":"7F3HT4XHWON3TP5E","created_at":"2026-06-23T20:14:13Z"},{"alias_kind":"pith_short_8","alias_value":"7F3HT4XH","created_at":"2026-06-23T20:14:13Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2025:7F3HT4XHWON3TP5E7FPLKHV3OV","target":"record","payload":{"canonical_record":{"source":{"id":"2504.13161","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2025-04-17T17:58:13Z","cross_cats_sorted":[],"title_canon_sha256":"ea3e64743b38ca5e47e86c70d06b4182138b1965508bef33424aba9ba4106691","abstract_canon_sha256":"beeeb5700d355783df460d2f39a7b42479fc06e472e98e25be418b70f9bf51ec"},"schema_version":"1.0"},"canonical_sha256":"f97679f2e7b39bb9bfa4f95eb51ebb754da148564a5076fb37f1d01fbf0faf9a","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-23T20:14:13.772171Z","signature_b64":"wtqB/5aEEp9k/1JwTRu5cQFo3ihd1fL6nHROfpEXQXToRg1n9eOxryBCZKEZNmq8liiI5ByHHRssgXt5U5WuDA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"f97679f2e7b39bb9bfa4f95eb51ebb754da148564a5076fb37f1d01fbf0faf9a","last_reissued_at":"2026-06-23T20:14:13.770636Z","signature_status":"signed_v1","first_computed_at":"2026-06-23T20:14:13.770636Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2504.13161","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-06-23T20:14:13Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"Qle1ruFB12UkWEcDwNyGK8BAtbUVCkF09NrFI5OI+mqeKR5j1/ibt4fbqsKxOxoY+W2N2RyaRzyZg+URH25iDw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-07-03T22:57:21.551644Z"},"content_sha256":"1f634d4d708c05509b588adeb814471f1e78abf0e595f0b3fcf6ea8f527fcb96","schema_version":"1.0","event_id":"sha256:1f634d4d708c05509b588adeb814471f1e78abf0e595f0b3fcf6ea8f527fcb96"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2025:7F3HT4XHWON3TP5E7FPLKHV3OV","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Nemotron-CLIMB: CLustering-based Iterative Data Mixture Bootstrapping for Language Model Pre-training","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Dan Su, Hongxu Yin, Jan Kautz, Markus Kliegl, Mostofa Patwary, Pavlo Molchanov, Peter Belcak, Shizhe Diao, Xin Dong, Yingyan Lin, Yonggan Fu, Yoshi Suhara, Yu Yang, Zijia Chen","submitted_at":"2025-04-17T17:58:13Z","abstract_excerpt":"Pre-training datasets are typically collected from web content and lack inherent domain divisions. For instance, widely used datasets like Common Crawl do not include explicit domain labels, while manually curating labeled datasets such as The Pile is labor-intensive. Consequently, identifying an optimal pre-training data mixture remains a challenging problem, despite its significant benefits for pre-training performance. To address these challenges, we propose CLustering-based Iterative Data Mixture Bootstrapping (Nemotron-CLIMB), an automated framework that discovers, evaluates, and refines "},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2504.13161","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2504.13161/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-06-23T20:14:13Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"sxV7qy1a//sDcR6jON6oAzv7x06icYbdzvt9y5Fd4uNROsa9AM47zV7c0d4D+wp7QVUb3G4pUChmK0sOmLDNCA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-07-03T22:57:21.552018Z"},"content_sha256":"c2032cbf7cf86172c60e1b322e3fc596e387390eadb98264a5fd975a1fb60b0a","schema_version":"1.0","event_id":"sha256:c2032cbf7cf86172c60e1b322e3fc596e387390eadb98264a5fd975a1fb60b0a"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/7F3HT4XHWON3TP5E7FPLKHV3OV/bundle.json","state_url":"https://pith.science/pith/7F3HT4XHWON3TP5E7FPLKHV3OV/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/7F3HT4XHWON3TP5E7FPLKHV3OV/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-07-03T22:57:21Z","links":{"resolver":"https://pith.science/pith/7F3HT4XHWON3TP5E7FPLKHV3OV","bundle":"https://pith.science/pith/7F3HT4XHWON3TP5E7FPLKHV3OV/bundle.json","state":"https://pith.science/pith/7F3HT4XHWON3TP5E7FPLKHV3OV/state.json","well_known_bundle":"https://pith.science/.well-known/pith/7F3HT4XHWON3TP5E7FPLKHV3OV/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2025:7F3HT4XHWON3TP5E7FPLKHV3OV","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"beeeb5700d355783df460d2f39a7b42479fc06e472e98e25be418b70f9bf51ec","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2025-04-17T17:58:13Z","title_canon_sha256":"ea3e64743b38ca5e47e86c70d06b4182138b1965508bef33424aba9ba4106691"},"schema_version":"1.0","source":{"id":"2504.13161","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2504.13161","created_at":"2026-06-23T20:14:13Z"},{"alias_kind":"arxiv_version","alias_value":"2504.13161v2","created_at":"2026-06-23T20:14:13Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2504.13161","created_at":"2026-06-23T20:14:13Z"},{"alias_kind":"pith_short_12","alias_value":"7F3HT4XHWON3","created_at":"2026-06-23T20:14:13Z"},{"alias_kind":"pith_short_16","alias_value":"7F3HT4XHWON3TP5E","created_at":"2026-06-23T20:14:13Z"},{"alias_kind":"pith_short_8","alias_value":"7F3HT4XH","created_at":"2026-06-23T20:14:13Z"}],"graph_snapshots":[{"event_id":"sha256:c2032cbf7cf86172c60e1b322e3fc596e387390eadb98264a5fd975a1fb60b0a","target":"graph","created_at":"2026-06-23T20:14:13Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2504.13161/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Pre-training datasets are typically collected from web content and lack inherent domain divisions. For instance, widely used datasets like Common Crawl do not include explicit domain labels, while manually curating labeled datasets such as The Pile is labor-intensive. Consequently, identifying an optimal pre-training data mixture remains a challenging problem, despite its significant benefits for pre-training performance. To address these challenges, we propose CLustering-based Iterative Data Mixture Bootstrapping (Nemotron-CLIMB), an automated framework that discovers, evaluates, and refines ","authors_text":"Dan Su, Hongxu Yin, Jan Kautz, Markus Kliegl, Mostofa Patwary, Pavlo Molchanov, Peter Belcak, Shizhe Diao, Xin Dong, Yingyan Lin, Yonggan Fu, Yoshi Suhara, Yu Yang, Zijia Chen","cross_cats":[],"headline":"","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2025-04-17T17:58:13Z","title":"Nemotron-CLIMB: CLustering-based Iterative Data Mixture Bootstrapping for Language Model Pre-training"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2504.13161","kind":"arxiv","version":2},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:1f634d4d708c05509b588adeb814471f1e78abf0e595f0b3fcf6ea8f527fcb96","target":"record","created_at":"2026-06-23T20:14:13Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"beeeb5700d355783df460d2f39a7b42479fc06e472e98e25be418b70f9bf51ec","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2025-04-17T17:58:13Z","title_canon_sha256":"ea3e64743b38ca5e47e86c70d06b4182138b1965508bef33424aba9ba4106691"},"schema_version":"1.0","source":{"id":"2504.13161","kind":"arxiv","version":2}},"canonical_sha256":"f97679f2e7b39bb9bfa4f95eb51ebb754da148564a5076fb37f1d01fbf0faf9a","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"f97679f2e7b39bb9bfa4f95eb51ebb754da148564a5076fb37f1d01fbf0faf9a","first_computed_at":"2026-06-23T20:14:13.770636Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-06-23T20:14:13.770636Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"wtqB/5aEEp9k/1JwTRu5cQFo3ihd1fL6nHROfpEXQXToRg1n9eOxryBCZKEZNmq8liiI5ByHHRssgXt5U5WuDA==","signature_status":"signed_v1","signed_at":"2026-06-23T20:14:13.772171Z","signed_message":"canonical_sha256_bytes"},"source_id":"2504.13161","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:1f634d4d708c05509b588adeb814471f1e78abf0e595f0b3fcf6ea8f527fcb96","sha256:c2032cbf7cf86172c60e1b322e3fc596e387390eadb98264a5fd975a1fb60b0a"],"state_sha256":"9bc4c1649bb27a6c19ad35309a5d3087e93bc247f60d4b3e987d6c027cf48010"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"KRzmVdswJn9XrA4lAFhi98eT9zp/1SfDP7vJfGcezYiZzZhQRF9qBkhO1gYxjqucu5Alegof0xzRDbtiNb5tCw==","signed_message":"bundle_sha256_bytes","signed_at":"2026-07-03T22:57:21.554170Z","bundle_sha256":"c92a457b9c58c059f5e1ebb9c23b42d47ca689d188f36380b00bcb88450f780c"}}