{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2020:KPXUNR6THD3NXDGYDRYVC6QNO2","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"3749c25aaae21dcfecfa070717e38d7d70f9e1fbaa64207045cf52e3f8b4d422","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2020-09-24T03:17:19Z","title_canon_sha256":"d92c209d59272778bc18f45a0692e5200a239338ee2718041aa4910328593b2b"},"schema_version":"1.0","source":{"id":"2009.11462","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2009.11462","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"arxiv_version","alias_value":"2009.11462v2","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2009.11462","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"pith_short_12","alias_value":"KPXUNR6THD3N","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_16","alias_value":"KPXUNR6THD3NXDGY","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_8","alias_value":"KPXUNR6T","created_at":"2026-05-18T12:33:33Z"}],"graph_snapshots":[{"event_id":"sha256:a6ca9bdd508db6f7a26f73c36095fb63cb74fdfcf45f61f3e4286dd325abae34","target":"graph","created_at":"2026-05-17T23:38:50Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Using RealToxicityPrompts, we find that pretrained LMs can degenerate into toxic text even from seemingly innocuous prompts... no current method is failsafe against neural toxic degeneration."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the automated toxicity classifier produces scores that reliably correspond to human judgments of toxicity across diverse prompts and generations."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Language models produce toxic text from innocuous prompts, and no tested control method fully prevents it, demonstrated via a new 100K-prompt web-derived dataset."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Pretrained language models can generate toxic text from seemingly innocuous prompts, and no current control method prevents it reliably."}],"snapshot_sha256":"4b35b50293afaad2f088eb6c524f31a83d9fa50b6fc03b1d3193fee25a3882b1"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"ebc6b1d49076462c75a569a2f3f06601923192c18ccec6bef262004d5b1b8215"},"paper":{"abstract_excerpt":"Pretrained neural language models (LMs) are prone to generating racist, sexist, or otherwise toxic language which hinders their safe deployment. We investigate the extent to which pretrained LMs can be prompted to generate toxic language, and the effectiveness of controllable text generation algorithms at preventing such toxic degeneration. We create and release RealToxicityPrompts, a dataset of 100K naturally occurring, sentence-level prompts derived from a large corpus of English web text, paired with toxicity scores from a widely-used toxicity classifier. Using RealToxicityPrompts, we find ","authors_text":"Maarten Sap, Noah A. Smith, Samuel Gehman, Suchin Gururangan, Yejin Choi","cross_cats":[],"headline":"Pretrained language models can generate toxic text from seemingly innocuous prompts, and no current control method prevents it reliably.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2020-09-24T03:17:19Z","title":"RealToxicityPrompts: Evaluating Neural Toxic Degeneration in Language Models"},"references":{"count":12,"internal_anchors":1,"resolved_work":12,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"In Proceedings of the First Workshop on Gender Bias in Natural Language Processing, pages 33–39, Florence, Italy","work_id":"10cede94-a482-403f-93ff-42ba663eb54a","year":2018},{"cited_arxiv_id":"1607.04606","doi":"","is_internal_anchor":true,"ref_index":2,"title":"Enriching word vectors with subword information","work_id":"8d8270fc-359a-49cb-948b-94200e98ccb1","year":2016},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"In Proceedings of the 51st Annual Meeting of the Association for Compu- tational Linguistics (V olume 1: Long Papers), pages 250–259, Soﬁa, Bulgaria","work_id":"2b1d67a3-0dd6-4754-933a-37e1ec84650e","year":2020},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Lucas Dixon, John Li, Jeffrey Scott Sorensen, Nithum Thain, and Lucy Vasserman","work_id":"931e7131-2846-4dfe-90d6-99bec022b2b4","year":2018},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"In Proceedings of the 28th International Conference on International Conference on Machine Learning , ICML’11, page 10411048, Madison, WI, USA","work_id":"ca3997fc-540b-408d-be33-1037560236a9","year":null}],"snapshot_sha256":"110fd506bac8dd5f6dd040ced38f0803a301d2321ff610a7144027b1e5073b91"},"source":{"id":"2009.11462","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-15T18:16:59.965453Z","id":"8d0a36ef-b202-4602-baae-9b86514c0835","model_set":{"reader":"grok-4.3"},"one_line_summary":"Language models produce toxic text from innocuous prompts, and no tested control method fully prevents it, demonstrated via a new 100K-prompt web-derived dataset.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Pretrained language models can generate toxic text from seemingly innocuous prompts, and no current control method prevents it reliably.","strongest_claim":"Using RealToxicityPrompts, we find that pretrained LMs can degenerate into toxic text even from seemingly innocuous prompts... no current method is failsafe against neural toxic degeneration.","weakest_assumption":"That the automated toxicity classifier produces scores that reliably correspond to human judgments of toxicity across diverse prompts and generations."}},"verdict_id":"8d0a36ef-b202-4602-baae-9b86514c0835"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:0f81b701986ecc90ddb089eec22095dd11f02e358863c527995ef83fe2d1e967","target":"record","created_at":"2026-05-17T23:38:50Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"3749c25aaae21dcfecfa070717e38d7d70f9e1fbaa64207045cf52e3f8b4d422","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2020-09-24T03:17:19Z","title_canon_sha256":"d92c209d59272778bc18f45a0692e5200a239338ee2718041aa4910328593b2b"},"schema_version":"1.0","source":{"id":"2009.11462","kind":"arxiv","version":2}},"canonical_sha256":"53ef46c7d338f6db8cd81c71517a0d768edd406efdeedc909e2b1c1243e8fbf3","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"53ef46c7d338f6db8cd81c71517a0d768edd406efdeedc909e2b1c1243e8fbf3","first_computed_at":"2026-05-17T23:38:50.603114Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:50.603114Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"wu0RJEDg1thKew0NVwTxrsB+5n2fXRI4KKvGCzi1dIcMEoF+EcQYJNpvCVZRuFUuGh9K4DvWkFaWBXrHnunWAg==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:50.603637Z","signed_message":"canonical_sha256_bytes"},"source_id":"2009.11462","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:0f81b701986ecc90ddb089eec22095dd11f02e358863c527995ef83fe2d1e967","sha256:a6ca9bdd508db6f7a26f73c36095fb63cb74fdfcf45f61f3e4286dd325abae34"],"state_sha256":"6f807b1b1f5d99861248d11f30f2013bacd83c9a97e4c3523a0cbdd8ceb7aa27"}