{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:VBBZK2N2X2ROQ2X2HU4BZTSCOZ","short_pith_number":"pith:VBBZK2N2","schema_version":"1.0","canonical_sha256":"a8439569babea2e86afa3d381cce42764f0474e727ac53f82b0f85c88b5d4288","source":{"kind":"arxiv","id":"2509.22510","version":3},"attestation_state":"computed","paper":{"title":"We Think, Therefore We Align LLMs to Helpful, Harmless and Honest Before They Go Wrong","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Gautam Siddharth Kashyap, Mark Dras, Usman Naseem","submitted_at":"2025-09-26T15:52:21Z","abstract_excerpt":"Alignment of Large Language Models (LLMs) is the ability to satisfy desired objectives during generation, which is critical for trustworthy deployment. In practice, alignment is often operationalized through multiple objectives such as Helpfulness, Harmlessness, and Honesty (HHH). Prior works study alignment via steering vectors in standard Transformer decoders but treat objectives in isolation, where optimizing a single objective can overwrite others, leading to interference. Recent works attempt to address this limitation by extending steering to a 1-to-N Transformer setting by replicating r"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2509.22510","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2025-09-26T15:52:21Z","cross_cats_sorted":[],"title_canon_sha256":"22ec9a0cd5620de7bfba31d65efec72a852705413eaeee7f573be515043c5834","abstract_canon_sha256":"12990bea7ee7feea1a2bedce3908e945aaa882670dfb43cb86b6c6685c84573b"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T00:02:06.825894Z","signature_b64":"sW4rEHDdniSKNezjhnTT0EV9gGqItefrCdN231DG27kMLcYQY+qJFKGgxew8nE+F6vAbSDhAUXqlGUWFcz8iDQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"a8439569babea2e86afa3d381cce42764f0474e727ac53f82b0f85c88b5d4288","last_reissued_at":"2026-05-20T00:02:06.825187Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T00:02:06.825187Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"We Think, Therefore We Align LLMs to Helpful, Harmless and Honest Before They Go Wrong","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Gautam Siddharth Kashyap, Mark Dras, Usman Naseem","submitted_at":"2025-09-26T15:52:21Z","abstract_excerpt":"Alignment of Large Language Models (LLMs) is the ability to satisfy desired objectives during generation, which is critical for trustworthy deployment. In practice, alignment is often operationalized through multiple objectives such as Helpfulness, Harmlessness, and Honesty (HHH). Prior works study alignment via steering vectors in standard Transformer decoders but treat objectives in isolation, where optimizing a single objective can overwrite others, leading to interference. Recent works attempt to address this limitation by extending steering to a 1-to-N Transformer setting by replicating r"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2509.22510","kind":"arxiv","version":3},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2509.22510/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2509.22510","created_at":"2026-05-20T00:02:06.825295+00:00"},{"alias_kind":"arxiv_version","alias_value":"2509.22510v3","created_at":"2026-05-20T00:02:06.825295+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2509.22510","created_at":"2026-05-20T00:02:06.825295+00:00"},{"alias_kind":"pith_short_12","alias_value":"VBBZK2N2X2RO","created_at":"2026-05-20T00:02:06.825295+00:00"},{"alias_kind":"pith_short_16","alias_value":"VBBZK2N2X2ROQ2X2","created_at":"2026-05-20T00:02:06.825295+00:00"},{"alias_kind":"pith_short_8","alias_value":"VBBZK2N2","created_at":"2026-05-20T00:02:06.825295+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/VBBZK2N2X2ROQ2X2HU4BZTSCOZ","json":"https://pith.science/pith/VBBZK2N2X2ROQ2X2HU4BZTSCOZ.json","graph_json":"https://pith.science/api/pith-number/VBBZK2N2X2ROQ2X2HU4BZTSCOZ/graph.json","events_json":"https://pith.science/api/pith-number/VBBZK2N2X2ROQ2X2HU4BZTSCOZ/events.json","paper":"https://pith.science/paper/VBBZK2N2"},"agent_actions":{"view_html":"https://pith.science/pith/VBBZK2N2X2ROQ2X2HU4BZTSCOZ","download_json":"https://pith.science/pith/VBBZK2N2X2ROQ2X2HU4BZTSCOZ.json","view_paper":"https://pith.science/paper/VBBZK2N2","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2509.22510&json=true","fetch_graph":"https://pith.science/api/pith-number/VBBZK2N2X2ROQ2X2HU4BZTSCOZ/graph.json","fetch_events":"https://pith.science/api/pith-number/VBBZK2N2X2ROQ2X2HU4BZTSCOZ/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/VBBZK2N2X2ROQ2X2HU4BZTSCOZ/action/timestamp_anchor","attest_storage":"https://pith.science/pith/VBBZK2N2X2ROQ2X2HU4BZTSCOZ/action/storage_attestation","attest_author":"https://pith.science/pith/VBBZK2N2X2ROQ2X2HU4BZTSCOZ/action/author_attestation","sign_citation":"https://pith.science/pith/VBBZK2N2X2ROQ2X2HU4BZTSCOZ/action/citation_signature","submit_replication":"https://pith.science/pith/VBBZK2N2X2ROQ2X2HU4BZTSCOZ/action/replication_record"}},"created_at":"2026-05-20T00:02:06.825295+00:00","updated_at":"2026-05-20T00:02:06.825295+00:00"}