{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:LB6M4H2YOBUYZTQQJZFXAT43CI","short_pith_number":"pith:LB6M4H2Y","schema_version":"1.0","canonical_sha256":"587cce1f5870698cce104e4b704f9b1212246c162f98492efdd1460b0bfacb21","source":{"kind":"arxiv","id":"2605.14074","version":1},"attestation_state":"computed","paper":{"title":"Fair and Calibrated Toxicity Detection with Robust Training and Abstention","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Toxicity detectors hide calibration unfairness across identity subgroups despite near-perfect overall scores.","cross_cats":[],"primary_cat":"cs.LG","authors_text":"Mokshit Surana","submitted_at":"2026-05-13T19:50:35Z","abstract_excerpt":"Fairness in toxicity classification involves three integrated axes: ranking, calibration, and abstention. Training-time interventions and post-hoc safety mechanisms cannot be evaluated independently because the former determines the efficacy of the latter. We compare Empirical Risk Minimization (ERM), instance-level reweighting, and Group DRO across these axes, combined with temperature scaling, confidence-based abstention, and per-identity threshold optimization. Evaluation uses subgroup AUC, BPSN/BNSP AUC, error gaps, and per-subgroup Expected Calibration Error (ECE) with bootstrap CIs ($n ="},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":true,"formal_links_present":true},"canonical_record":{"source":{"id":"2605.14074","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-13T19:50:35Z","cross_cats_sorted":[],"title_canon_sha256":"6474fa5dc1d69786de7cc7a9010ec3710929a3e99b89268e89646fbe05119328","abstract_canon_sha256":"3ecd350b56951bc7424beb6da781a938e660910527815ad3d5ee02497cd04d29"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:39:12.382759Z","signature_b64":"aOrA0vZITgS43WfzhlpVtqwumZY/vI+uieSuxC2OGRAEjmUrdftwvw/pMw8Ba1Hk4G1ucmalhz38Nj59XGKvBQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"587cce1f5870698cce104e4b704f9b1212246c162f98492efdd1460b0bfacb21","last_reissued_at":"2026-05-17T23:39:12.382333Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:39:12.382333Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Fair and Calibrated Toxicity Detection with Robust Training and Abstention","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Toxicity detectors hide calibration unfairness across identity subgroups despite near-perfect overall scores.","cross_cats":[],"primary_cat":"cs.LG","authors_text":"Mokshit Surana","submitted_at":"2026-05-13T19:50:35Z","abstract_excerpt":"Fairness in toxicity classification involves three integrated axes: ranking, calibration, and abstention. Training-time interventions and post-hoc safety mechanisms cannot be evaluated independently because the former determines the efficacy of the latter. We compare Empirical Risk Minimization (ERM), instance-level reweighting, and Group DRO across these axes, combined with temperature scaling, confidence-based abstention, and per-identity threshold optimization. Evaluation uses subgroup AUC, BPSN/BNSP AUC, error gaps, and per-subgroup Expected Calibration Error (ECE) with bootstrap CIs ($n ="},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Calibration disparity is a hidden fairness violation. ERM has near-perfect aggregate calibration (0.013) but is significantly miscalibrated across all identity subgroups (+0.029 to +0.134). Training interventions reshape rather than eliminate disparity, and abstention itself is unfair.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the chosen subgroup definitions, metrics (subgroup AUC, BPSN/BNSP AUC, ECE), and bootstrap CIs fully capture real-world fairness harms and that post-hoc methods can be evaluated independently of training choices.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Training interventions reshape rather than eliminate calibration and abstention disparities in toxicity detection, requiring a multi-axis fairness framework.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Toxicity detectors hide calibration unfairness across identity subgroups despite near-perfect overall scores.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"d5dbafe0cf106450938bf58f5b8e6ed37c21bb28e07fb256dd3d2486e4f4ed8e"},"source":{"id":"2605.14074","kind":"arxiv","version":1},"verdict":{"id":"489b6283-612b-4940-97b4-9db6692fba1e","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T05:06:16.459603Z","strongest_claim":"Calibration disparity is a hidden fairness violation. ERM has near-perfect aggregate calibration (0.013) but is significantly miscalibrated across all identity subgroups (+0.029 to +0.134). Training interventions reshape rather than eliminate disparity, and abstention itself is unfair.","one_line_summary":"Training interventions reshape rather than eliminate calibration and abstention disparities in toxicity detection, requiring a multi-axis fairness framework.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the chosen subgroup definitions, metrics (subgroup AUC, BPSN/BNSP AUC, ECE), and bootstrap CIs fully capture real-world fairness harms and that post-hoc methods can be evaluated independently of training choices.","pith_extraction_headline":"Toxicity detectors hide calibration unfairness across identity subgroups despite near-perfect overall scores."},"references":{"count":11,"sample":[{"doi":"","year":2019,"title":"Borkan, D., Dixon, L., Sorensen, J., Thain, N., and Vasserman, L. (2019). Nuanced metrics for measuring unintended bias with real data for text classification.WWW Companion","work_id":"3caf62a1-4189-425f-a7df-ea89c0d1365a","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2018,"title":"Dixon, L., Li, J., Sorensen, J., Thain, N., and Vasserman, L. (2018). Measuring and mitigating unintended bias in text classification.AAAI/ACM AIES","work_id":"55b0a4ff-f708-47f0-83c5-a5d676db6a39","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2017,"title":"Geifman, Y ., and El-Yaniv, R. (2017). Selective classification for deep neural networks.NeurIPS 2017","work_id":"021ef3f4-3004-4ba0-bbdb-d05a03b7acbc","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2017,"title":"Guo, C., Pleiss, G., Sun, Y ., and Weinberger, K. Q. (2017). On calibration of modern neural networks.ICML 2017","work_id":"c1da11f2-7c41-4a02-9ee0-58c0967b7a7f","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2022,"title":"Y ., Arjovsky, M., Pezeshki, M., and Lopez-Paz, D","work_id":"2430ab6f-f9b0-43ac-a6ce-15eba2f4d7c3","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":11,"snapshot_sha256":"599498bc6f961f09f584cce49721295015273607d73c97fee8ca86f11f643391","internal_anchors":0},"formal_canon":{"evidence_count":1,"snapshot_sha256":"1f8a2ee6f87ae8a710e85312f7b8c0cfe251e854650c344287e394fd91e1ad8a"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2605.14074","created_at":"2026-05-17T23:39:12.382395+00:00"},{"alias_kind":"arxiv_version","alias_value":"2605.14074v1","created_at":"2026-05-17T23:39:12.382395+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.14074","created_at":"2026-05-17T23:39:12.382395+00:00"},{"alias_kind":"pith_short_12","alias_value":"LB6M4H2YOBUY","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_16","alias_value":"LB6M4H2YOBUYZTQQ","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_8","alias_value":"LB6M4H2Y","created_at":"2026-05-18T12:33:37.589309+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":1,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/LB6M4H2YOBUYZTQQJZFXAT43CI","json":"https://pith.science/pith/LB6M4H2YOBUYZTQQJZFXAT43CI.json","graph_json":"https://pith.science/api/pith-number/LB6M4H2YOBUYZTQQJZFXAT43CI/graph.json","events_json":"https://pith.science/api/pith-number/LB6M4H2YOBUYZTQQJZFXAT43CI/events.json","paper":"https://pith.science/paper/LB6M4H2Y"},"agent_actions":{"view_html":"https://pith.science/pith/LB6M4H2YOBUYZTQQJZFXAT43CI","download_json":"https://pith.science/pith/LB6M4H2YOBUYZTQQJZFXAT43CI.json","view_paper":"https://pith.science/paper/LB6M4H2Y","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2605.14074&json=true","fetch_graph":"https://pith.science/api/pith-number/LB6M4H2YOBUYZTQQJZFXAT43CI/graph.json","fetch_events":"https://pith.science/api/pith-number/LB6M4H2YOBUYZTQQJZFXAT43CI/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/LB6M4H2YOBUYZTQQJZFXAT43CI/action/timestamp_anchor","attest_storage":"https://pith.science/pith/LB6M4H2YOBUYZTQQJZFXAT43CI/action/storage_attestation","attest_author":"https://pith.science/pith/LB6M4H2YOBUYZTQQJZFXAT43CI/action/author_attestation","sign_citation":"https://pith.science/pith/LB6M4H2YOBUYZTQQJZFXAT43CI/action/citation_signature","submit_replication":"https://pith.science/pith/LB6M4H2YOBUYZTQQJZFXAT43CI/action/replication_record"}},"created_at":"2026-05-17T23:39:12.382395+00:00","updated_at":"2026-05-17T23:39:12.382395+00:00"}