{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:DXYECBGOX6GNDMZQNJPFM46SRI","short_pith_number":"pith:DXYECBGO","schema_version":"1.0","canonical_sha256":"1df04104cebf8cd1b3306a5e5673d28a04a170f0d1a35e7d81b04335cce4d6c6","source":{"kind":"arxiv","id":"2606.10487","version":1},"attestation_state":"computed","paper":{"title":"Stop Early, Spend Less: Hidden-State Probes as a Practical Recipe for Streaming Moderation of LLM Outputs","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.LG","authors_text":"Huizhen Shu, Piao Xue, Xuying Li","submitted_at":"2026-06-09T07:01:43Z","abstract_excerpt":"Deploying large language models in user-facing systems requires efficient output safety filtering. Existing approaches typically rely on a separate moderation model applied after generation, which doubles inference cost and only detects violations after generation completes. We observe that the signal needed for moderation is already present in the model hidden states. Based on this, we train lightweight token-level probes that operate directly on internal activations, producing per-token safety scores that can be aggregated for both offline evaluation and online intervention. The probe reuses"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.10487","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.LG","submitted_at":"2026-06-09T07:01:43Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"c2a999f68e3265812459f427c45e1a31ccf230c28bc4f2242ac1ef81e6a7b479","abstract_canon_sha256":"b49c9543bc2125ed5eaee5ff383737b3708f5f047b56fa0ebe3b92dea4e6961f"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-10T01:10:21.935343Z","signature_b64":"ue1q0ehxTlo28FZjYh8STtCcNse0obXMnP1EdAXqkOjiMW2Nimc09v2O5xbthnTAcK4n6jKqfPWSTYOuO35cAw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"1df04104cebf8cd1b3306a5e5673d28a04a170f0d1a35e7d81b04335cce4d6c6","last_reissued_at":"2026-06-10T01:10:21.934434Z","signature_status":"signed_v1","first_computed_at":"2026-06-10T01:10:21.934434Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Stop Early, Spend Less: Hidden-State Probes as a Practical Recipe for Streaming Moderation of LLM Outputs","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.LG","authors_text":"Huizhen Shu, Piao Xue, Xuying Li","submitted_at":"2026-06-09T07:01:43Z","abstract_excerpt":"Deploying large language models in user-facing systems requires efficient output safety filtering. Existing approaches typically rely on a separate moderation model applied after generation, which doubles inference cost and only detects violations after generation completes. We observe that the signal needed for moderation is already present in the model hidden states. Based on this, we train lightweight token-level probes that operate directly on internal activations, producing per-token safety scores that can be aggregated for both offline evaluation and online intervention. The probe reuses"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.10487","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.10487/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.10487","created_at":"2026-06-10T01:10:21.934576+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.10487v1","created_at":"2026-06-10T01:10:21.934576+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.10487","created_at":"2026-06-10T01:10:21.934576+00:00"},{"alias_kind":"pith_short_12","alias_value":"DXYECBGOX6GN","created_at":"2026-06-10T01:10:21.934576+00:00"},{"alias_kind":"pith_short_16","alias_value":"DXYECBGOX6GNDMZQ","created_at":"2026-06-10T01:10:21.934576+00:00"},{"alias_kind":"pith_short_8","alias_value":"DXYECBGO","created_at":"2026-06-10T01:10:21.934576+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/DXYECBGOX6GNDMZQNJPFM46SRI","json":"https://pith.science/pith/DXYECBGOX6GNDMZQNJPFM46SRI.json","graph_json":"https://pith.science/api/pith-number/DXYECBGOX6GNDMZQNJPFM46SRI/graph.json","events_json":"https://pith.science/api/pith-number/DXYECBGOX6GNDMZQNJPFM46SRI/events.json","paper":"https://pith.science/paper/DXYECBGO"},"agent_actions":{"view_html":"https://pith.science/pith/DXYECBGOX6GNDMZQNJPFM46SRI","download_json":"https://pith.science/pith/DXYECBGOX6GNDMZQNJPFM46SRI.json","view_paper":"https://pith.science/paper/DXYECBGO","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.10487&json=true","fetch_graph":"https://pith.science/api/pith-number/DXYECBGOX6GNDMZQNJPFM46SRI/graph.json","fetch_events":"https://pith.science/api/pith-number/DXYECBGOX6GNDMZQNJPFM46SRI/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/DXYECBGOX6GNDMZQNJPFM46SRI/action/timestamp_anchor","attest_storage":"https://pith.science/pith/DXYECBGOX6GNDMZQNJPFM46SRI/action/storage_attestation","attest_author":"https://pith.science/pith/DXYECBGOX6GNDMZQNJPFM46SRI/action/author_attestation","sign_citation":"https://pith.science/pith/DXYECBGOX6GNDMZQNJPFM46SRI/action/citation_signature","submit_replication":"https://pith.science/pith/DXYECBGOX6GNDMZQNJPFM46SRI/action/replication_record"}},"created_at":"2026-06-10T01:10:21.934576+00:00","updated_at":"2026-06-10T01:10:21.934576+00:00"}