{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:5CF2FRIB7XEHHQQBCH4BJQVLJD","short_pith_number":"pith:5CF2FRIB","canonical_record":{"source":{"id":"2602.06713","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"stat.ML","submitted_at":"2026-02-06T14:02:12Z","cross_cats_sorted":["cs.LG"],"title_canon_sha256":"fb588127e8260d5200ea0ca53675ea64cfe3ea06cf46aea5de227913b21a62dd","abstract_canon_sha256":"154ba1dec97baf864cb26a2c7b78f7007eafd1ae3ba81abd47c4ebadabcf3e3b"},"schema_version":"1.0"},"canonical_sha256":"e88ba2c501fdc873c20111f814c2ab48f262c39038db48000c9e16c1097bdf1e","source":{"kind":"arxiv","id":"2602.06713","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2602.06713","created_at":"2026-05-18T02:45:05Z"},{"alias_kind":"arxiv_version","alias_value":"2602.06713v2","created_at":"2026-05-18T02:45:05Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2602.06713","created_at":"2026-05-18T02:45:05Z"},{"alias_kind":"pith_short_12","alias_value":"5CF2FRIB7XEH","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"5CF2FRIB7XEHHQQB","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"5CF2FRIB","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:5CF2FRIB7XEHHQQBCH4BJQVLJD","target":"record","payload":{"canonical_record":{"source":{"id":"2602.06713","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"stat.ML","submitted_at":"2026-02-06T14:02:12Z","cross_cats_sorted":["cs.LG"],"title_canon_sha256":"fb588127e8260d5200ea0ca53675ea64cfe3ea06cf46aea5de227913b21a62dd","abstract_canon_sha256":"154ba1dec97baf864cb26a2c7b78f7007eafd1ae3ba81abd47c4ebadabcf3e3b"},"schema_version":"1.0"},"canonical_sha256":"e88ba2c501fdc873c20111f814c2ab48f262c39038db48000c9e16c1097bdf1e","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T02:45:05.373950Z","signature_b64":"GdO84lyKsQPnjtYVvzJ+UPkufKXXT6/VtfOI7GYwBTNaN6ZHcbWCx7h/fTMQD4uoeGvLwV3eLqh2JtIC1/qRAw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"e88ba2c501fdc873c20111f814c2ab48f262c39038db48000c9e16c1097bdf1e","last_reissued_at":"2026-05-18T02:45:05.373339Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T02:45:05.373339Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2602.06713","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T02:45:05Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"WMMNFK/sdxsBMzhTMiztBLuXKdXdH32VEgxsBgwORE8d4n3crrE3NqBF2JcP6EjGIuY8Fu8Eod+be8wmLc50AQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-07T06:10:19.485158Z"},"content_sha256":"5f0c72df58c8a462419f8f6454a9db9787e7fdd5482ca0f291c4b56debb88f8a","schema_version":"1.0","event_id":"sha256:5f0c72df58c8a462419f8f6454a9db9787e7fdd5482ca0f291c4b56debb88f8a"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:5CF2FRIB7XEHHQQBCH4BJQVLJD","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Distribution Shift in Missing Data Imputation: A Risk-Based Perspective and Importance-Weighted Correction under MAR","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Standard imputation fails to minimize full-data error under MAR because observed training data differs in distribution from the target.","cross_cats":["cs.LG"],"primary_cat":"stat.ML","authors_text":"Katarzyna Reluga, Luke Shannon, Song Liu","submitted_at":"2026-02-06T14:02:12Z","abstract_excerpt":"Missing data imputation, where a model is trained on observed data to estimate unobserved values, is a fundamental problem in machine learning. In this paper, we rigorously formulate imputation model learning as a mean-squared error risk minimisation problem. We show that when the probability of missingness depends on the data, many state-of-the-art methods fail to account for the resulting distribution shift between the observed data used for training and the full data distribution used for evaluation. Consequently, these approaches do not minimise mean-squared error on the full data distribu"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"We propose a novel imputation algorithm designed to learn an imputation model from the observed data while explicitly accounting for this distribution shift. Simulation studies show consistent improvements over otherwise identical uncorrected baselines, with average reductions of 3% in RMSE and 7% in Wasserstein distance.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The missingness mechanism satisfies MAR so that missingness probabilities can be estimated from observed data alone and used to form reliable importance weights without introducing additional bias.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Standard imputation methods fail to minimize full-data MSE under MAR due to distribution shift; a new importance-weighted algorithm corrects for it and improves RMSE by 3% and Wasserstein distance by 7% in simulations.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Standard imputation fails to minimize full-data error under MAR because observed training data differs in distribution from the target.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"2b74ca602ee9409a4ed1b6157c837657fb5d1abbbe461502918f8c2b4546eb5b"},"source":{"id":"2602.06713","kind":"arxiv","version":2},"verdict":{"id":"65e325c2-a198-461b-a1f4-a1a6b29e8e96","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-16T06:29:58.014416Z","strongest_claim":"We propose a novel imputation algorithm designed to learn an imputation model from the observed data while explicitly accounting for this distribution shift. Simulation studies show consistent improvements over otherwise identical uncorrected baselines, with average reductions of 3% in RMSE and 7% in Wasserstein distance.","one_line_summary":"Standard imputation methods fail to minimize full-data MSE under MAR due to distribution shift; a new importance-weighted algorithm corrects for it and improves RMSE by 3% and Wasserstein distance by 7% in simulations.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The missingness mechanism satisfies MAR so that missingness probabilities can be estimated from observed data alone and used to form reliable importance weights without introducing additional bias.","pith_extraction_headline":"Standard imputation fails to minimize full-data error under MAR because observed training data differs in distribution from the target."},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":2,"snapshot_sha256":"299a63bac86bfe264fbf0edb7d1718f5144f3cc61b20e6b5ffc8204d7aef562b"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"65e325c2-a198-461b-a1f4-a1a6b29e8e96"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T02:45:05Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"pmBZw8fI2Kf1yccxqqrLYwDYVS3RdbrqQ/yLjVD0OVcqBbOXU14B1p+CZX5ZJC5mgxlhycvTC7Fn8co6P3UPBg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-07T06:10:19.486013Z"},"content_sha256":"00ffda1329991a8b411d9c9fba94c356bb276c681be67e7d197e1fca0aa8e262","schema_version":"1.0","event_id":"sha256:00ffda1329991a8b411d9c9fba94c356bb276c681be67e7d197e1fca0aa8e262"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/5CF2FRIB7XEHHQQBCH4BJQVLJD/bundle.json","state_url":"https://pith.science/pith/5CF2FRIB7XEHHQQBCH4BJQVLJD/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/5CF2FRIB7XEHHQQBCH4BJQVLJD/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-07T06:10:19Z","links":{"resolver":"https://pith.science/pith/5CF2FRIB7XEHHQQBCH4BJQVLJD","bundle":"https://pith.science/pith/5CF2FRIB7XEHHQQBCH4BJQVLJD/bundle.json","state":"https://pith.science/pith/5CF2FRIB7XEHHQQBCH4BJQVLJD/state.json","well_known_bundle":"https://pith.science/.well-known/pith/5CF2FRIB7XEHHQQBCH4BJQVLJD/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:5CF2FRIB7XEHHQQBCH4BJQVLJD","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"154ba1dec97baf864cb26a2c7b78f7007eafd1ae3ba81abd47c4ebadabcf3e3b","cross_cats_sorted":["cs.LG"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"stat.ML","submitted_at":"2026-02-06T14:02:12Z","title_canon_sha256":"fb588127e8260d5200ea0ca53675ea64cfe3ea06cf46aea5de227913b21a62dd"},"schema_version":"1.0","source":{"id":"2602.06713","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2602.06713","created_at":"2026-05-18T02:45:05Z"},{"alias_kind":"arxiv_version","alias_value":"2602.06713v2","created_at":"2026-05-18T02:45:05Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2602.06713","created_at":"2026-05-18T02:45:05Z"},{"alias_kind":"pith_short_12","alias_value":"5CF2FRIB7XEH","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"5CF2FRIB7XEHHQQB","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"5CF2FRIB","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:00ffda1329991a8b411d9c9fba94c356bb276c681be67e7d197e1fca0aa8e262","target":"graph","created_at":"2026-05-18T02:45:05Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"We propose a novel imputation algorithm designed to learn an imputation model from the observed data while explicitly accounting for this distribution shift. Simulation studies show consistent improvements over otherwise identical uncorrected baselines, with average reductions of 3% in RMSE and 7% in Wasserstein distance."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"The missingness mechanism satisfies MAR so that missingness probabilities can be estimated from observed data alone and used to form reliable importance weights without introducing additional bias."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Standard imputation methods fail to minimize full-data MSE under MAR due to distribution shift; a new importance-weighted algorithm corrects for it and improves RMSE by 3% and Wasserstein distance by 7% in simulations."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Standard imputation fails to minimize full-data error under MAR because observed training data differs in distribution from the target."}],"snapshot_sha256":"2b74ca602ee9409a4ed1b6157c837657fb5d1abbbe461502918f8c2b4546eb5b"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"299a63bac86bfe264fbf0edb7d1718f5144f3cc61b20e6b5ffc8204d7aef562b"},"paper":{"abstract_excerpt":"Missing data imputation, where a model is trained on observed data to estimate unobserved values, is a fundamental problem in machine learning. In this paper, we rigorously formulate imputation model learning as a mean-squared error risk minimisation problem. We show that when the probability of missingness depends on the data, many state-of-the-art methods fail to account for the resulting distribution shift between the observed data used for training and the full data distribution used for evaluation. Consequently, these approaches do not minimise mean-squared error on the full data distribu","authors_text":"Katarzyna Reluga, Luke Shannon, Song Liu","cross_cats":["cs.LG"],"headline":"Standard imputation fails to minimize full-data error under MAR because observed training data differs in distribution from the target.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"stat.ML","submitted_at":"2026-02-06T14:02:12Z","title":"Distribution Shift in Missing Data Imputation: A Risk-Based Perspective and Importance-Weighted Correction under MAR"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2602.06713","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-16T06:29:58.014416Z","id":"65e325c2-a198-461b-a1f4-a1a6b29e8e96","model_set":{"reader":"grok-4.3"},"one_line_summary":"Standard imputation methods fail to minimize full-data MSE under MAR due to distribution shift; a new importance-weighted algorithm corrects for it and improves RMSE by 3% and Wasserstein distance by 7% in simulations.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Standard imputation fails to minimize full-data error under MAR because observed training data differs in distribution from the target.","strongest_claim":"We propose a novel imputation algorithm designed to learn an imputation model from the observed data while explicitly accounting for this distribution shift. Simulation studies show consistent improvements over otherwise identical uncorrected baselines, with average reductions of 3% in RMSE and 7% in Wasserstein distance.","weakest_assumption":"The missingness mechanism satisfies MAR so that missingness probabilities can be estimated from observed data alone and used to form reliable importance weights without introducing additional bias."}},"verdict_id":"65e325c2-a198-461b-a1f4-a1a6b29e8e96"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:5f0c72df58c8a462419f8f6454a9db9787e7fdd5482ca0f291c4b56debb88f8a","target":"record","created_at":"2026-05-18T02:45:05Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"154ba1dec97baf864cb26a2c7b78f7007eafd1ae3ba81abd47c4ebadabcf3e3b","cross_cats_sorted":["cs.LG"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"stat.ML","submitted_at":"2026-02-06T14:02:12Z","title_canon_sha256":"fb588127e8260d5200ea0ca53675ea64cfe3ea06cf46aea5de227913b21a62dd"},"schema_version":"1.0","source":{"id":"2602.06713","kind":"arxiv","version":2}},"canonical_sha256":"e88ba2c501fdc873c20111f814c2ab48f262c39038db48000c9e16c1097bdf1e","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"e88ba2c501fdc873c20111f814c2ab48f262c39038db48000c9e16c1097bdf1e","first_computed_at":"2026-05-18T02:45:05.373339Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-18T02:45:05.373339Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"GdO84lyKsQPnjtYVvzJ+UPkufKXXT6/VtfOI7GYwBTNaN6ZHcbWCx7h/fTMQD4uoeGvLwV3eLqh2JtIC1/qRAw==","signature_status":"signed_v1","signed_at":"2026-05-18T02:45:05.373950Z","signed_message":"canonical_sha256_bytes"},"source_id":"2602.06713","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:5f0c72df58c8a462419f8f6454a9db9787e7fdd5482ca0f291c4b56debb88f8a","sha256:00ffda1329991a8b411d9c9fba94c356bb276c681be67e7d197e1fca0aa8e262"],"state_sha256":"60a53010ff2368543bb0b072f093e1339a8094499f47257c7f589a97c40a20be"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"JlGDTWkth2h85f5WBA9qkf6QNAHxlEawXaC8o5cP+UVST2ZICXSbLa+JlRkigA7uNwlNcS2CxwabjM6A72ZUCQ==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-07T06:10:19.490899Z","bundle_sha256":"208c28f07e33af1a72b9aa1b45168542652502a4ba2302f2adc3f3b65af10b8c"}}