{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:TATCZOFLYXKDUXOV6FHBF3UMGM","short_pith_number":"pith:TATCZOFL","canonical_record":{"source":{"id":"2605.14517","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2026-05-14T08:00:23Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"5ac0bc7d8a0afe706e34908ca025e05f0c65720dad7e87369c1b63f1d7f4d1a0","abstract_canon_sha256":"19e0caffa6a0c51973879c6faeeb44b896269924f87986fbb66398bca62d093f"},"schema_version":"1.0"},"canonical_sha256":"98262cb8abc5d43a5dd5f14e12ee8c332f7f93f4288ae09227c2c39e386d8774","source":{"kind":"arxiv","id":"2605.14517","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.14517","created_at":"2026-05-17T23:39:06Z"},{"alias_kind":"arxiv_version","alias_value":"2605.14517v1","created_at":"2026-05-17T23:39:06Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.14517","created_at":"2026-05-17T23:39:06Z"},{"alias_kind":"pith_short_12","alias_value":"TATCZOFLYXKD","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"TATCZOFLYXKDUXOV","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"TATCZOFL","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:TATCZOFLYXKDUXOV6FHBF3UMGM","target":"record","payload":{"canonical_record":{"source":{"id":"2605.14517","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2026-05-14T08:00:23Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"5ac0bc7d8a0afe706e34908ca025e05f0c65720dad7e87369c1b63f1d7f4d1a0","abstract_canon_sha256":"19e0caffa6a0c51973879c6faeeb44b896269924f87986fbb66398bca62d093f"},"schema_version":"1.0"},"canonical_sha256":"98262cb8abc5d43a5dd5f14e12ee8c332f7f93f4288ae09227c2c39e386d8774","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:39:06.111876Z","signature_b64":"9WJgLB6KHFwAd9vTHNrBhkvtJzWh8zFbjSkQDiKggsuCvhiaTKrwDX8o0lTeEaSWdZsajtyeW1f9FebiQ2C1BA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"98262cb8abc5d43a5dd5f14e12ee8c332f7f93f4288ae09227c2c39e386d8774","last_reissued_at":"2026-05-17T23:39:06.111116Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:39:06.111116Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2605.14517","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:39:06Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"vYbKXREf6SoFxtQt38ZWStoddjbeitcYOo0pKa0h9nce4PAZ2WzQzXmeOAIFNVo+KHahM+7txRZ0NOsSR8TzCQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-30T12:26:07.655659Z"},"content_sha256":"37ef6af42085d5b4a9dc16f4eb724bcc06a3d6f988d5205a8e0b41d86fff062c","schema_version":"1.0","event_id":"sha256:37ef6af42085d5b4a9dc16f4eb724bcc06a3d6f988d5205a8e0b41d86fff062c"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:TATCZOFLYXKDUXOV6FHBF3UMGM","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Dimension-Level Intent Fidelity Evaluation for Large Language Models: Evidence from Structured Prompt Ablation","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"Many LLM outputs with perfect holistic scores still miss user intent on specific dimensions.","cross_cats":["cs.AI"],"primary_cat":"cs.CL","authors_text":"Gang Peng","submitted_at":"2026-05-14T08:00:23Z","abstract_excerpt":"Holistic evaluation scores capture overall output quality but do not distinguish whether a model reproduced the structural form of a user's request from whether it preserved the user's specific intent. We propose a dimension-level intent fidelity evaluation framework, applied here through a structured prompt ablation study across 2,880 outputs spanning three languages, three task domains, and six LLMs, that separately measures structural recovery and intent fidelity for each semantic dimension. This framework reveals a systematic structural-fidelity split: among Chinese-language outputs with c"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"among Chinese-language outputs with complete paired scores, 25.7% received perfect holistic alignment scores (GA=5) while exhibiting measurable dimensional intent deficits; among English-language outputs, this proportion rose to 58.6%.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the structured prompt ablation and proxy annotation reliably isolate prior inferability from default recoverability without introducing selection bias or confounding the human validation of split-zone outputs.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Dimension-level evaluation reveals that 25-58% of LLM outputs with perfect holistic scores still show measurable intent deficits across languages and domains.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Many LLM outputs with perfect holistic scores still miss user intent on specific dimensions.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"89d7d1a6b4dc2f4f62fec683ba35c833ccad8881351aee6336066576769bf52b"},"source":{"id":"2605.14517","kind":"arxiv","version":1},"verdict":{"id":"37c5806e-26a7-4719-a1b1-ebef4a6a6b74","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T01:46:31.109511Z","strongest_claim":"among Chinese-language outputs with complete paired scores, 25.7% received perfect holistic alignment scores (GA=5) while exhibiting measurable dimensional intent deficits; among English-language outputs, this proportion rose to 58.6%.","one_line_summary":"Dimension-level evaluation reveals that 25-58% of LLM outputs with perfect holistic scores still show measurable intent deficits across languages and domains.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the structured prompt ablation and proxy annotation reliably isolate prior inferability from default recoverability without introducing selection bias or confounding the human validation of split-zone outputs.","pith_extraction_headline":"Many LLM outputs with perfect holistic scores still miss user intent on specific dimensions."},"references":{"count":18,"sample":[{"doi":"","year":2023,"title":"Huang, L. et al. A survey on hallucination in large language models. ACM Comput. Surv. 57, 1–38 (2023)","work_id":"ea7fa718-e6a3-47ff-926a-97da57bf594d","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"Ji, Z. et al. Survey of hallucination in natural language generation. ACM Comput. Surv. 55, 1–38 (2023)","work_id":"6b6fc1f1-634a-47a4-87cf-38d8fbbbe7ae","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"A Survey of Hallucination in Large Foundation Models","work_id":"1b84f221-37fa-403a-9bf4-1741910454bf","ref_index":3,"cited_arxiv_id":"2309.05922","is_internal_anchor":true},{"doi":"","year":2020,"title":"Lewis, P. et al. Retrieval -augmented generation for knowledge -intensive NLP tasks. NeurIPS (2020)","work_id":"ee4e961f-6219-4b1a-8423-cc0da4c47cbe","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2022,"title":"Ouyang, L. et al. Training language models to follow instructions with human feed back. NeurIPS (2022)","work_id":"b165154e-efbf-45d6-af4c-ee656ceecf30","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":18,"snapshot_sha256":"fe047bdb7b2b8b98cd122270b67278961c3b832c2d4b1503026c6bc5f9daaba0","internal_anchors":4},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"37c5806e-26a7-4719-a1b1-ebef4a6a6b74"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:39:06Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"VZV1JoWCTP/hi+EFhyb+q3vLc+auhRLHMN51KxUp7fq5ABPB977X/Av5Kl7KTsQnDuD8IyuAdd4j7t1GlZ8mBQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-30T12:26:07.656308Z"},"content_sha256":"3655508f9dfed5417a04149cc33b7290975b28223b43c0011a6a48bd8f9bf147","schema_version":"1.0","event_id":"sha256:3655508f9dfed5417a04149cc33b7290975b28223b43c0011a6a48bd8f9bf147"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/TATCZOFLYXKDUXOV6FHBF3UMGM/bundle.json","state_url":"https://pith.science/pith/TATCZOFLYXKDUXOV6FHBF3UMGM/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/TATCZOFLYXKDUXOV6FHBF3UMGM/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-30T12:26:07Z","links":{"resolver":"https://pith.science/pith/TATCZOFLYXKDUXOV6FHBF3UMGM","bundle":"https://pith.science/pith/TATCZOFLYXKDUXOV6FHBF3UMGM/bundle.json","state":"https://pith.science/pith/TATCZOFLYXKDUXOV6FHBF3UMGM/state.json","well_known_bundle":"https://pith.science/.well-known/pith/TATCZOFLYXKDUXOV6FHBF3UMGM/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:TATCZOFLYXKDUXOV6FHBF3UMGM","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"19e0caffa6a0c51973879c6faeeb44b896269924f87986fbb66398bca62d093f","cross_cats_sorted":["cs.AI"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2026-05-14T08:00:23Z","title_canon_sha256":"5ac0bc7d8a0afe706e34908ca025e05f0c65720dad7e87369c1b63f1d7f4d1a0"},"schema_version":"1.0","source":{"id":"2605.14517","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.14517","created_at":"2026-05-17T23:39:06Z"},{"alias_kind":"arxiv_version","alias_value":"2605.14517v1","created_at":"2026-05-17T23:39:06Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.14517","created_at":"2026-05-17T23:39:06Z"},{"alias_kind":"pith_short_12","alias_value":"TATCZOFLYXKD","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"TATCZOFLYXKDUXOV","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"TATCZOFL","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:3655508f9dfed5417a04149cc33b7290975b28223b43c0011a6a48bd8f9bf147","target":"graph","created_at":"2026-05-17T23:39:06Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"among Chinese-language outputs with complete paired scores, 25.7% received perfect holistic alignment scores (GA=5) while exhibiting measurable dimensional intent deficits; among English-language outputs, this proportion rose to 58.6%."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the structured prompt ablation and proxy annotation reliably isolate prior inferability from default recoverability without introducing selection bias or confounding the human validation of split-zone outputs."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Dimension-level evaluation reveals that 25-58% of LLM outputs with perfect holistic scores still show measurable intent deficits across languages and domains."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Many LLM outputs with perfect holistic scores still miss user intent on specific dimensions."}],"snapshot_sha256":"89d7d1a6b4dc2f4f62fec683ba35c833ccad8881351aee6336066576769bf52b"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"Holistic evaluation scores capture overall output quality but do not distinguish whether a model reproduced the structural form of a user's request from whether it preserved the user's specific intent. We propose a dimension-level intent fidelity evaluation framework, applied here through a structured prompt ablation study across 2,880 outputs spanning three languages, three task domains, and six LLMs, that separately measures structural recovery and intent fidelity for each semantic dimension. This framework reveals a systematic structural-fidelity split: among Chinese-language outputs with c","authors_text":"Gang Peng","cross_cats":["cs.AI"],"headline":"Many LLM outputs with perfect holistic scores still miss user intent on specific dimensions.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2026-05-14T08:00:23Z","title":"Dimension-Level Intent Fidelity Evaluation for Large Language Models: Evidence from Structured Prompt Ablation"},"references":{"count":18,"internal_anchors":4,"resolved_work":18,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Huang, L. et al. A survey on hallucination in large language models. ACM Comput. Surv. 57, 1–38 (2023)","work_id":"ea7fa718-e6a3-47ff-926a-97da57bf594d","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Ji, Z. et al. Survey of hallucination in natural language generation. ACM Comput. Surv. 55, 1–38 (2023)","work_id":"6b6fc1f1-634a-47a4-87cf-38d8fbbbe7ae","year":2023},{"cited_arxiv_id":"2309.05922","doi":"","is_internal_anchor":true,"ref_index":3,"title":"A Survey of Hallucination in Large Foundation Models","work_id":"1b84f221-37fa-403a-9bf4-1741910454bf","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Lewis, P. et al. Retrieval -augmented generation for knowledge -intensive NLP tasks. NeurIPS (2020)","work_id":"ee4e961f-6219-4b1a-8423-cc0da4c47cbe","year":2020},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Ouyang, L. et al. Training language models to follow instructions with human feed back. NeurIPS (2022)","work_id":"b165154e-efbf-45d6-af4c-ee656ceecf30","year":2022}],"snapshot_sha256":"fe047bdb7b2b8b98cd122270b67278961c3b832c2d4b1503026c6bc5f9daaba0"},"source":{"id":"2605.14517","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-15T01:46:31.109511Z","id":"37c5806e-26a7-4719-a1b1-ebef4a6a6b74","model_set":{"reader":"grok-4.3"},"one_line_summary":"Dimension-level evaluation reveals that 25-58% of LLM outputs with perfect holistic scores still show measurable intent deficits across languages and domains.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Many LLM outputs with perfect holistic scores still miss user intent on specific dimensions.","strongest_claim":"among Chinese-language outputs with complete paired scores, 25.7% received perfect holistic alignment scores (GA=5) while exhibiting measurable dimensional intent deficits; among English-language outputs, this proportion rose to 58.6%.","weakest_assumption":"That the structured prompt ablation and proxy annotation reliably isolate prior inferability from default recoverability without introducing selection bias or confounding the human validation of split-zone outputs."}},"verdict_id":"37c5806e-26a7-4719-a1b1-ebef4a6a6b74"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:37ef6af42085d5b4a9dc16f4eb724bcc06a3d6f988d5205a8e0b41d86fff062c","target":"record","created_at":"2026-05-17T23:39:06Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"19e0caffa6a0c51973879c6faeeb44b896269924f87986fbb66398bca62d093f","cross_cats_sorted":["cs.AI"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2026-05-14T08:00:23Z","title_canon_sha256":"5ac0bc7d8a0afe706e34908ca025e05f0c65720dad7e87369c1b63f1d7f4d1a0"},"schema_version":"1.0","source":{"id":"2605.14517","kind":"arxiv","version":1}},"canonical_sha256":"98262cb8abc5d43a5dd5f14e12ee8c332f7f93f4288ae09227c2c39e386d8774","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"98262cb8abc5d43a5dd5f14e12ee8c332f7f93f4288ae09227c2c39e386d8774","first_computed_at":"2026-05-17T23:39:06.111116Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:39:06.111116Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"9WJgLB6KHFwAd9vTHNrBhkvtJzWh8zFbjSkQDiKggsuCvhiaTKrwDX8o0lTeEaSWdZsajtyeW1f9FebiQ2C1BA==","signature_status":"signed_v1","signed_at":"2026-05-17T23:39:06.111876Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.14517","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:37ef6af42085d5b4a9dc16f4eb724bcc06a3d6f988d5205a8e0b41d86fff062c","sha256:3655508f9dfed5417a04149cc33b7290975b28223b43c0011a6a48bd8f9bf147"],"state_sha256":"de09b1f7a64fba0e43cabd4f8022cc7bfe4798b9de97799a1650f26df2623239"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"r0G7s5J9SZyS79XHDv9T1Lgsgpd3v9ndf8LUpWcxVTEtQbAvOI45wN2df1vM4Ya/v4p+H43wEowAik7qwtBwAg==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-30T12:26:07.659088Z","bundle_sha256":"28701a5000f348700a5f5d9d5a889eb96c60806418dd43fd35568d327f8caed9"}}