{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2023:ZGTKTZFW6VAR2IZJADJARJK4TJ","short_pith_number":"pith:ZGTKTZFW","canonical_record":{"source":{"id":"2308.05374","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.AI","submitted_at":"2023-08-10T06:43:44Z","cross_cats_sorted":["cs.LG"],"title_canon_sha256":"e4f29685ef9212d331f35b161dfd4efe86e04c62c4d0faf6cdb9dac9031623f4","abstract_canon_sha256":"f486721c6f283b619343311b946661d598241a74b6d7b31ef1a7c3e8492341d3"},"schema_version":"1.0"},"canonical_sha256":"c9a6a9e4b6f5411d232900d208a55c9a7de412fd7489d4c2e8ab15a9219e1409","source":{"kind":"arxiv","id":"2308.05374","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2308.05374","created_at":"2026-05-17T23:38:12Z"},{"alias_kind":"arxiv_version","alias_value":"2308.05374v2","created_at":"2026-05-17T23:38:12Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2308.05374","created_at":"2026-05-17T23:38:12Z"},{"alias_kind":"pith_short_12","alias_value":"ZGTKTZFW6VAR","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"ZGTKTZFW6VAR2IZJ","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"ZGTKTZFW","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2023:ZGTKTZFW6VAR2IZJADJARJK4TJ","target":"record","payload":{"canonical_record":{"source":{"id":"2308.05374","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.AI","submitted_at":"2023-08-10T06:43:44Z","cross_cats_sorted":["cs.LG"],"title_canon_sha256":"e4f29685ef9212d331f35b161dfd4efe86e04c62c4d0faf6cdb9dac9031623f4","abstract_canon_sha256":"f486721c6f283b619343311b946661d598241a74b6d7b31ef1a7c3e8492341d3"},"schema_version":"1.0"},"canonical_sha256":"c9a6a9e4b6f5411d232900d208a55c9a7de412fd7489d4c2e8ab15a9219e1409","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:12.821267Z","signature_b64":"V0lqwxKY0ZcqinddUuWyfIupwATVdmtYNaClm0borAh+iIUcSN1QIL9ewWm7FKZ24qj12ueiYY9Qy09MidOBDw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"c9a6a9e4b6f5411d232900d208a55c9a7de412fd7489d4c2e8ab15a9219e1409","last_reissued_at":"2026-05-17T23:38:12.820356Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:12.820356Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2308.05374","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:12Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"tspJMiUl9zbZ6IG3LxiegZVI5H80NBp/N+gYxHtbZcEVT06kE+rqbx3pgYq3PqvXP8/0kzJiT2F3iEfmd4IFBw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-25T21:30:04.011608Z"},"content_sha256":"a6bca85d888e0056131593bbd24a85f9f6cce4c101c026464b9c2d5adf48ebd9","schema_version":"1.0","event_id":"sha256:a6bca85d888e0056131593bbd24a85f9f6cce4c101c026464b9c2d5adf48ebd9"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2023:ZGTKTZFW6VAR2IZJADJARJK4TJ","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Trustworthy LLMs: a Survey and Guideline for Evaluating Large Language Models' Alignment","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","headline":"A survey finds that more aligned LLMs generally achieve higher trustworthiness, though the gains differ across categories.","cross_cats":["cs.LG"],"primary_cat":"cs.AI","authors_text":"Hang Li, Hao Cheng, Jean-Francois Ton, Muhammad Faaiz Taufiq, Ruocheng Guo, Xiaoying Zhang, Yang Liu, Yegor Klochkov, Yuanshun Yao","submitted_at":"2023-08-10T06:43:44Z","abstract_excerpt":"Ensuring alignment, which refers to making models behave in accordance with human intentions [1,2], has become a critical task before deploying large language models (LLMs) in real-world applications. For instance, OpenAI devoted six months to iteratively aligning GPT-4 before its release [3]. However, a major challenge faced by practitioners is the lack of clear guidance on evaluating whether LLM outputs align with social norms, values, and regulations. This obstacle hinders systematic iteration and deployment of LLMs. To address this issue, this paper presents a comprehensive survey of key d"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"The measurement results indicate that, in general, more aligned models tend to perform better in terms of overall trustworthiness. However, the effectiveness of alignment varies across the different trustworthiness categories considered.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the seven categories and 29 sub-categories comprehensively capture trustworthiness and that the selected eight sub-categories plus the chosen measurement methods accurately reflect real-world alignment.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Survey organizes LLM trustworthiness into seven categories and 29 sub-categories, measures eight sub-categories on popular models, and finds that more aligned models generally score higher but with varying effectiveness.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"A survey finds that more aligned LLMs generally achieve higher trustworthiness, though the gains differ across categories.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"b3c0c23793df0f8c6173f470a100a0500014ec208030a0169927937bb1ec13a6"},"source":{"id":"2308.05374","kind":"arxiv","version":2},"verdict":{"id":"16967dac-bb6b-4eca-9c61-8bcb939a565a","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-17T22:26:47.271196Z","strongest_claim":"The measurement results indicate that, in general, more aligned models tend to perform better in terms of overall trustworthiness. However, the effectiveness of alignment varies across the different trustworthiness categories considered.","one_line_summary":"Survey organizes LLM trustworthiness into seven categories and 29 sub-categories, measures eight sub-categories on popular models, and finds that more aligned models generally score higher but with varying effectiveness.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the seven categories and 29 sub-categories comprehensively capture trustworthiness and that the selected eight sub-categories plus the chosen measurement methods accurately reflect real-world alignment.","pith_extraction_headline":"A survey finds that more aligned LLMs generally achieve higher trustworthiness, though the gains differ across categories."},"references":{"count":300,"sample":[{"doi":"","year":2022,"title":"Training language models to follow instructions with human feedback","work_id":"843d640e-e399-40c5-8e8f-789cae25da17","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2021,"title":"Alignment of language agents","work_id":"2dc6ed25-0b66-42f5-b67e-eb7e67977011","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"OpenAI. Gpt-4. https://openai.com/research/gpt-4, 2023","work_id":"d644ff37-47a2-4b15-a9ab-83415b8a3a60","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2021,"title":"On the dangers of stochastic parrots: Can language models be too big? In Proceedings of the 2021 ACM conference on fairness, accountability, and transparency, pages 610–623, 2021","work_id":"3ad5196c-6f5c-4854-b3dd-8d67a2979292","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2019,"title":"Language models are unsupervised multitask learners","work_id":"5fa609b3-1203-4f0e-a526-0110cb3f8046","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":300,"snapshot_sha256":"e592caa9fdb93f6a85a7aec285226791d31ccf6db0821a84e9d67bab365dc226","internal_anchors":29},"formal_canon":{"evidence_count":2,"snapshot_sha256":"42472f81a16dfaaadc5fa68af5aae9630f67e9d4c39de6f58e42e446375e0c77"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"16967dac-bb6b-4eca-9c61-8bcb939a565a"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:12Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"/NIGLzuES3tMhFIvOy3kctpBrFczVgiOY3ZqLBbfCR6e8Kp0XyWC4rJro4yvnAX7inFXkk2O/uZ9jbheNaeDBw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-25T21:30:04.012865Z"},"content_sha256":"99712a37310b03de854dfc31fc965efea9156d50b96f0a649ce0e8674f911e4b","schema_version":"1.0","event_id":"sha256:99712a37310b03de854dfc31fc965efea9156d50b96f0a649ce0e8674f911e4b"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/ZGTKTZFW6VAR2IZJADJARJK4TJ/bundle.json","state_url":"https://pith.science/pith/ZGTKTZFW6VAR2IZJADJARJK4TJ/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/ZGTKTZFW6VAR2IZJADJARJK4TJ/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-25T21:30:04Z","links":{"resolver":"https://pith.science/pith/ZGTKTZFW6VAR2IZJADJARJK4TJ","bundle":"https://pith.science/pith/ZGTKTZFW6VAR2IZJADJARJK4TJ/bundle.json","state":"https://pith.science/pith/ZGTKTZFW6VAR2IZJADJARJK4TJ/state.json","well_known_bundle":"https://pith.science/.well-known/pith/ZGTKTZFW6VAR2IZJADJARJK4TJ/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2023:ZGTKTZFW6VAR2IZJADJARJK4TJ","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"f486721c6f283b619343311b946661d598241a74b6d7b31ef1a7c3e8492341d3","cross_cats_sorted":["cs.LG"],"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.AI","submitted_at":"2023-08-10T06:43:44Z","title_canon_sha256":"e4f29685ef9212d331f35b161dfd4efe86e04c62c4d0faf6cdb9dac9031623f4"},"schema_version":"1.0","source":{"id":"2308.05374","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2308.05374","created_at":"2026-05-17T23:38:12Z"},{"alias_kind":"arxiv_version","alias_value":"2308.05374v2","created_at":"2026-05-17T23:38:12Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2308.05374","created_at":"2026-05-17T23:38:12Z"},{"alias_kind":"pith_short_12","alias_value":"ZGTKTZFW6VAR","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"ZGTKTZFW6VAR2IZJ","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"ZGTKTZFW","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:99712a37310b03de854dfc31fc965efea9156d50b96f0a649ce0e8674f911e4b","target":"graph","created_at":"2026-05-17T23:38:12Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"The measurement results indicate that, in general, more aligned models tend to perform better in terms of overall trustworthiness. However, the effectiveness of alignment varies across the different trustworthiness categories considered."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the seven categories and 29 sub-categories comprehensively capture trustworthiness and that the selected eight sub-categories plus the chosen measurement methods accurately reflect real-world alignment."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Survey organizes LLM trustworthiness into seven categories and 29 sub-categories, measures eight sub-categories on popular models, and finds that more aligned models generally score higher but with varying effectiveness."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"A survey finds that more aligned LLMs generally achieve higher trustworthiness, though the gains differ across categories."}],"snapshot_sha256":"b3c0c23793df0f8c6173f470a100a0500014ec208030a0169927937bb1ec13a6"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"42472f81a16dfaaadc5fa68af5aae9630f67e9d4c39de6f58e42e446375e0c77"},"paper":{"abstract_excerpt":"Ensuring alignment, which refers to making models behave in accordance with human intentions [1,2], has become a critical task before deploying large language models (LLMs) in real-world applications. For instance, OpenAI devoted six months to iteratively aligning GPT-4 before its release [3]. However, a major challenge faced by practitioners is the lack of clear guidance on evaluating whether LLM outputs align with social norms, values, and regulations. This obstacle hinders systematic iteration and deployment of LLMs. To address this issue, this paper presents a comprehensive survey of key d","authors_text":"Hang Li, Hao Cheng, Jean-Francois Ton, Muhammad Faaiz Taufiq, Ruocheng Guo, Xiaoying Zhang, Yang Liu, Yegor Klochkov, Yuanshun Yao","cross_cats":["cs.LG"],"headline":"A survey finds that more aligned LLMs generally achieve higher trustworthiness, though the gains differ across categories.","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.AI","submitted_at":"2023-08-10T06:43:44Z","title":"Trustworthy LLMs: a Survey and Guideline for Evaluating Large Language Models' Alignment"},"references":{"count":300,"internal_anchors":29,"resolved_work":300,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Training language models to follow instructions with human feedback","work_id":"843d640e-e399-40c5-8e8f-789cae25da17","year":2022},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Alignment of language agents","work_id":"2dc6ed25-0b66-42f5-b67e-eb7e67977011","year":2021},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"OpenAI. Gpt-4. https://openai.com/research/gpt-4, 2023","work_id":"d644ff37-47a2-4b15-a9ab-83415b8a3a60","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"On the dangers of stochastic parrots: Can language models be too big? In Proceedings of the 2021 ACM conference on fairness, accountability, and transparency, pages 610–623, 2021","work_id":"3ad5196c-6f5c-4854-b3dd-8d67a2979292","year":2021},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Language models are unsupervised multitask learners","work_id":"5fa609b3-1203-4f0e-a526-0110cb3f8046","year":2019}],"snapshot_sha256":"e592caa9fdb93f6a85a7aec285226791d31ccf6db0821a84e9d67bab365dc226"},"source":{"id":"2308.05374","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-17T22:26:47.271196Z","id":"16967dac-bb6b-4eca-9c61-8bcb939a565a","model_set":{"reader":"grok-4.3"},"one_line_summary":"Survey organizes LLM trustworthiness into seven categories and 29 sub-categories, measures eight sub-categories on popular models, and finds that more aligned models generally score higher but with varying effectiveness.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"A survey finds that more aligned LLMs generally achieve higher trustworthiness, though the gains differ across categories.","strongest_claim":"The measurement results indicate that, in general, more aligned models tend to perform better in terms of overall trustworthiness. However, the effectiveness of alignment varies across the different trustworthiness categories considered.","weakest_assumption":"That the seven categories and 29 sub-categories comprehensively capture trustworthiness and that the selected eight sub-categories plus the chosen measurement methods accurately reflect real-world alignment."}},"verdict_id":"16967dac-bb6b-4eca-9c61-8bcb939a565a"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:a6bca85d888e0056131593bbd24a85f9f6cce4c101c026464b9c2d5adf48ebd9","target":"record","created_at":"2026-05-17T23:38:12Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"f486721c6f283b619343311b946661d598241a74b6d7b31ef1a7c3e8492341d3","cross_cats_sorted":["cs.LG"],"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.AI","submitted_at":"2023-08-10T06:43:44Z","title_canon_sha256":"e4f29685ef9212d331f35b161dfd4efe86e04c62c4d0faf6cdb9dac9031623f4"},"schema_version":"1.0","source":{"id":"2308.05374","kind":"arxiv","version":2}},"canonical_sha256":"c9a6a9e4b6f5411d232900d208a55c9a7de412fd7489d4c2e8ab15a9219e1409","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"c9a6a9e4b6f5411d232900d208a55c9a7de412fd7489d4c2e8ab15a9219e1409","first_computed_at":"2026-05-17T23:38:12.820356Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:12.820356Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"V0lqwxKY0ZcqinddUuWyfIupwATVdmtYNaClm0borAh+iIUcSN1QIL9ewWm7FKZ24qj12ueiYY9Qy09MidOBDw==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:12.821267Z","signed_message":"canonical_sha256_bytes"},"source_id":"2308.05374","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:a6bca85d888e0056131593bbd24a85f9f6cce4c101c026464b9c2d5adf48ebd9","sha256:99712a37310b03de854dfc31fc965efea9156d50b96f0a649ce0e8674f911e4b"],"state_sha256":"b64d6ef8e1b0fd7a94221a56f87ca00ef981da28f49b0ad3c1cf3b8046632128"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"WNrRW0U8Qcd7WTxRv6SlYrg7Fgq1oaauNK275w13+MiMhzBYOJjM3WdFPI932FLQujPlkoY/wT0pRD5YiEzcDA==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-25T21:30:04.018058Z","bundle_sha256":"825c0ab5659cbb8c9799aa68b9660512b3f2f0a42007d58666d7f453cc43f4dc"}}