{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:Y4V6HMACAJD67ZG4JGLUWEGRKY","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"066e71923d82a2f4dd5e76eb6faedb1a5f9f66947a96db1434281bc01d53e406","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2026-05-13T10:04:38Z","title_canon_sha256":"953e1d82724d775a6928ad2fe96e76f53fc9a7250e47331bcf88840cc3f13822"},"schema_version":"1.0","source":{"id":"2605.13290","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.13290","created_at":"2026-05-18T02:44:49Z"},{"alias_kind":"arxiv_version","alias_value":"2605.13290v1","created_at":"2026-05-18T02:44:49Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.13290","created_at":"2026-05-18T02:44:49Z"},{"alias_kind":"pith_short_12","alias_value":"Y4V6HMACAJD6","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"Y4V6HMACAJD67ZG4","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"Y4V6HMAC","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:446836da96218d01bf4696d36564150df5288cb8b140574f3e5915fb1b5a6849","target":"graph","created_at":"2026-05-18T02:44:49Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Our analysis reveals that these intrinsic metrics demonstrate strong and significant correlations with downstream model performance. Crucially, we find that the predictors of utility are scale-dependent."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the semantically distinct variants of a single Polish reasoning dataset are representative enough for the observed scale-dependent patterns to generalize to other languages, domains, and model families."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Intrinsic data metrics predict reasoning dataset utility for model fine-tuning, with different predictors working best for smaller versus larger models."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Intrinsic metrics on reasoning data strongly predict downstream model performance in a scale-dependent way."}],"snapshot_sha256":"7f0eb79a92eaab0f73b7ab1aaae158a6f28540d8356609464db8dbac00f15659"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"32ed1c712062ede0c407f0f3e6e1e84463501d7e7edcdd33fb9702a8ab84cf4b"},"paper":{"abstract_excerpt":"Validating training data for reasoning models typically requires expensive trial-and-error fine-tuning cycles. In this work, we investigate whether the utility of a reasoning dataset can be reliably predicted prior to training using intrinsic data metrics. We propose a suite of quantitative measures and evaluate their predictive power by fine-tuning 8B and 11B models on semantically distinct variants of a Polish reasoning dataset. Our analysis reveals that these intrinsic metrics demonstrate strong and significant correlations with downstream model performance. Crucially, we find that the pred","authors_text":"Dzmitry Pihulski, Jan Eliasz, Jan Koco\\'n, Maciej Piasecki, Micha{\\l} Rajkowski, Miko{\\l}aj Langner, Przemys{\\l}aw Kazienko, Teddy Ferdinan","cross_cats":[],"headline":"Intrinsic metrics on reasoning data strongly predict downstream model performance in a scale-dependent way.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2026-05-13T10:04:38Z","title":"What properties of reasoning supervision are associated with improved downstream model quality?"},"references":{"count":40,"internal_anchors":3,"resolved_work":40,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Bandarkar, L., et al.: The belebele benchmark: a parallel reading comprehension dataset in 122 language variants. In: ACL. pp. 749–775 (2024) 14 M. Langner et al","work_id":"3091917c-7714-465e-9941-18561a936ec8","year":2024},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Bercovich, A., et al.: Llama-nemotron: Efficient reasoning models (2025)","work_id":"79c13186-7c10-4942-a0c5-e37f4f47ee2e","year":2025},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"A.et al.Global piqa: Evaluating physical commonsense reasoning across 100+ languages and cultures (2025)","work_id":"5856c7a6-6f3b-42c6-9514-56c1131cee21","year":2025},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"In: Proceedings of SIGMOD","work_id":"1b11590f-9469-4da6-8e98-8553efac27bd","year":2024},{"cited_arxiv_id":"2505.05410","doi":"","is_internal_anchor":true,"ref_index":5,"title":"Reasoning Models Don't Always Say What They Think","work_id":"b9bdcbf5-9ae0-464c-b1a6-de04f85a6e33","year":2025}],"snapshot_sha256":"47b8fbb337353a491caa8c71e44764f4e4697465cb11ca88ba221d7c10c9eeba"},"source":{"id":"2605.13290","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-14T19:32:58.449379Z","id":"1fff5dc3-7360-4d54-aaab-d5224eb28b18","model_set":{"reader":"grok-4.3"},"one_line_summary":"Intrinsic data metrics predict reasoning dataset utility for model fine-tuning, with different predictors working best for smaller versus larger models.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Intrinsic metrics on reasoning data strongly predict downstream model performance in a scale-dependent way.","strongest_claim":"Our analysis reveals that these intrinsic metrics demonstrate strong and significant correlations with downstream model performance. Crucially, we find that the predictors of utility are scale-dependent.","weakest_assumption":"That the semantically distinct variants of a single Polish reasoning dataset are representative enough for the observed scale-dependent patterns to generalize to other languages, domains, and model families."}},"verdict_id":"1fff5dc3-7360-4d54-aaab-d5224eb28b18"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:7d98dcde916b681e8b582f65a7992fd7804f28236ce2d2a79c6a5e20a752721b","target":"record","created_at":"2026-05-18T02:44:49Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"066e71923d82a2f4dd5e76eb6faedb1a5f9f66947a96db1434281bc01d53e406","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2026-05-13T10:04:38Z","title_canon_sha256":"953e1d82724d775a6928ad2fe96e76f53fc9a7250e47331bcf88840cc3f13822"},"schema_version":"1.0","source":{"id":"2605.13290","kind":"arxiv","version":1}},"canonical_sha256":"c72be3b0020247efe4dc49974b10d1563dc8e33e016149d2e6e10327808b9913","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"c72be3b0020247efe4dc49974b10d1563dc8e33e016149d2e6e10327808b9913","first_computed_at":"2026-05-18T02:44:49.126010Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-18T02:44:49.126010Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"fvmq8c1Ux4ddgx6DR2b7exQPpOreY0vpzBcleGzGgBcUfIYhaEkpuHlzJYakkAs42Dy+87w2b5WDPBuAsCuyAQ==","signature_status":"signed_v1","signed_at":"2026-05-18T02:44:49.126441Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.13290","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:7d98dcde916b681e8b582f65a7992fd7804f28236ce2d2a79c6a5e20a752721b","sha256:446836da96218d01bf4696d36564150df5288cb8b140574f3e5915fb1b5a6849"],"state_sha256":"e11e85f6cac9f61e41cf21ba25ff413007d35d0d4c11d0d8cfc330e24b36302d"}