{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:STEVJY6CYHYU4GRPMWXRHNVAP2","short_pith_number":"pith:STEVJY6C","schema_version":"1.0","canonical_sha256":"94c954e3c2c1f14e1a2f65af13b6a07e9fed098907cbc2340130c28d6c07cddd","source":{"kind":"arxiv","id":"2606.07631","version":1},"attestation_state":"computed","paper":{"title":"Trait-space Monitoring for Emergent Misalignment During Supervised Finetuning","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI","cs.CY"],"primary_cat":"cs.LG","authors_text":"Hal Daum\\'e III, Huy Nghiem, Sarah Wiegreffe, Sy-Tuyen Ho","submitted_at":"2026-05-31T04:28:21Z","abstract_excerpt":"Emergent misalignment (EM) occurs when narrow finetuning causes a model to behave dangerously outside the finetuning task. Standard training signals can miss this shift, making reliable detection costly if it depends on repeated behavioral evaluation. We ask whether emergent misalignment can instead be detected from internal representations during finetuning. Using seven alignment-relevant traits encoded as linear directions in activation space, we track representational drift across training checkpoints in four open-source 7-9B LLMs. EM-relevant drift concentrates on a low-dimensional axis th"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.07631","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-31T04:28:21Z","cross_cats_sorted":["cs.AI","cs.CY"],"title_canon_sha256":"1c949e7fc7fe22e1bc185ecf4a9a066120e7e4f4c9ac93bf19b62bb15ca7e393","abstract_canon_sha256":"35e318a81a995000dce9c8b5823bb33b1be88729b186229c76bd12f77393a13d"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-09T00:04:45.249121Z","signature_b64":"0tMZJx+SJBwCovxd7LaQns4aid7RLE9SxDbeiAjNfEiUZdrbqhiFh5ik16NQgUsF0ktQG5TDrNMbBovcNhIzDQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"94c954e3c2c1f14e1a2f65af13b6a07e9fed098907cbc2340130c28d6c07cddd","last_reissued_at":"2026-06-09T00:04:45.248782Z","signature_status":"signed_v1","first_computed_at":"2026-06-09T00:04:45.248782Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Trait-space Monitoring for Emergent Misalignment During Supervised Finetuning","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI","cs.CY"],"primary_cat":"cs.LG","authors_text":"Hal Daum\\'e III, Huy Nghiem, Sarah Wiegreffe, Sy-Tuyen Ho","submitted_at":"2026-05-31T04:28:21Z","abstract_excerpt":"Emergent misalignment (EM) occurs when narrow finetuning causes a model to behave dangerously outside the finetuning task. Standard training signals can miss this shift, making reliable detection costly if it depends on repeated behavioral evaluation. We ask whether emergent misalignment can instead be detected from internal representations during finetuning. Using seven alignment-relevant traits encoded as linear directions in activation space, we track representational drift across training checkpoints in four open-source 7-9B LLMs. EM-relevant drift concentrates on a low-dimensional axis th"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.07631","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.07631/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.07631","created_at":"2026-06-09T00:04:45.248842+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.07631v1","created_at":"2026-06-09T00:04:45.248842+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.07631","created_at":"2026-06-09T00:04:45.248842+00:00"},{"alias_kind":"pith_short_12","alias_value":"STEVJY6CYHYU","created_at":"2026-06-09T00:04:45.248842+00:00"},{"alias_kind":"pith_short_16","alias_value":"STEVJY6CYHYU4GRP","created_at":"2026-06-09T00:04:45.248842+00:00"},{"alias_kind":"pith_short_8","alias_value":"STEVJY6C","created_at":"2026-06-09T00:04:45.248842+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/STEVJY6CYHYU4GRPMWXRHNVAP2","json":"https://pith.science/pith/STEVJY6CYHYU4GRPMWXRHNVAP2.json","graph_json":"https://pith.science/api/pith-number/STEVJY6CYHYU4GRPMWXRHNVAP2/graph.json","events_json":"https://pith.science/api/pith-number/STEVJY6CYHYU4GRPMWXRHNVAP2/events.json","paper":"https://pith.science/paper/STEVJY6C"},"agent_actions":{"view_html":"https://pith.science/pith/STEVJY6CYHYU4GRPMWXRHNVAP2","download_json":"https://pith.science/pith/STEVJY6CYHYU4GRPMWXRHNVAP2.json","view_paper":"https://pith.science/paper/STEVJY6C","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.07631&json=true","fetch_graph":"https://pith.science/api/pith-number/STEVJY6CYHYU4GRPMWXRHNVAP2/graph.json","fetch_events":"https://pith.science/api/pith-number/STEVJY6CYHYU4GRPMWXRHNVAP2/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/STEVJY6CYHYU4GRPMWXRHNVAP2/action/timestamp_anchor","attest_storage":"https://pith.science/pith/STEVJY6CYHYU4GRPMWXRHNVAP2/action/storage_attestation","attest_author":"https://pith.science/pith/STEVJY6CYHYU4GRPMWXRHNVAP2/action/author_attestation","sign_citation":"https://pith.science/pith/STEVJY6CYHYU4GRPMWXRHNVAP2/action/citation_signature","submit_replication":"https://pith.science/pith/STEVJY6CYHYU4GRPMWXRHNVAP2/action/replication_record"}},"created_at":"2026-06-09T00:04:45.248842+00:00","updated_at":"2026-06-09T00:04:45.248842+00:00"}