{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:IPZIX56FVQVL2OXHZHMS2OHRNT","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"118075b308bfe2d85f63508597282bc4489f77d3f7d7cc990bd032ea4eb9ba70","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-13T14:30:41Z","title_canon_sha256":"965d6650c7305554093d64c14df732b393b9e6c0db1a64ced164742a9d2176c8"},"schema_version":"1.0","source":{"id":"2605.13596","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.13596","created_at":"2026-05-18T02:44:23Z"},{"alias_kind":"arxiv_version","alias_value":"2605.13596v1","created_at":"2026-05-18T02:44:23Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.13596","created_at":"2026-05-18T02:44:23Z"},{"alias_kind":"pith_short_12","alias_value":"IPZIX56FVQVL","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"IPZIX56FVQVL2OXH","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"IPZIX56F","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:21d93f5dbccf0b52d754a57d5e26fa398a251b5b172927fc03e10cdcd35aecfb","target":"graph","created_at":"2026-05-18T02:44:23Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"both AEMs and LLM-as-a-judge evaluations correlate poorly with professional evaluations on creativity, with LLM-as-a-judge showing a systematic bias in favour of machine-translated texts and penalising creative and culturally appropriate solutions."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That detailed annotations by experienced professional literary translators constitute an objective and reliable ground truth for measuring creativity and translation quality across genres and modalities."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Automatic evaluation tools for literary translations correlate poorly with expert human judgments on creativity and exhibit bias favoring machine-translated texts."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Automatic evaluation metrics and LLM judges correlate poorly with professional translators on creativity in literary texts and bias toward machine outputs."}],"snapshot_sha256":"0f55518bf682a68175241d0a9116fc2265d3e705ad605523ef6c62f1f069aa88"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"This article investigates the performance of automatic evaluation metrics (AEMs) and LLM-as-a-judge evaluation on literary translation across multiple languages, genres, and translation modalities. The aim is to assess how well these tools align with professionals when evaluating translation, creativity (creative shifts & errors), and see if they can substitute laborious manual annotations. A dataset of literary translations across three modalities (human translation, machine translation, and post-editing), three genres and three language pairs was created and annotated in detail for creativit","authors_text":"Ana Guerberof Arenas, Kyo Gerrits, Rik van Noord","cross_cats":[],"headline":"Automatic evaluation metrics and LLM judges correlate poorly with professional translators on creativity in literary texts and bias toward machine outputs.","license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-13T14:30:41Z","title":"Creativity Bias: How Machine Evaluation Struggles with Creativity in Literary Translations"},"references":{"count":130,"internal_anchors":0,"resolved_work":130,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"and Ullman, Jeffrey D","work_id":"9f4d095b-5cc6-4ad8-a102-3e3ec582f2e1","year":1972},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Interspeech 2006 --- Ninth International Conference on Spoken Language Processing , address=","work_id":"dca1746a-b0f4-4744-96a6-703b6c190994","year":2006},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Publications. 1983 , publisher=","work_id":"653ffea5-4aac-4cbc-ac57-775ffd7e5df3","year":1983},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Chandra and Dexter C","work_id":"30e59c52-947d-4f04-ac44-aa8b8453b2b3","year":1981},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Coling 2008, 22nd International Conference on Computational Linguistics , address=","work_id":"5e2dd450-37f6-41fa-a3fc-30ae4540c645","year":2008}],"snapshot_sha256":"df677d842fccc16f555bd598783cf37afe9ed33b9044b9285f069c74e23edfcb"},"source":{"id":"2605.13596","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-14T20:02:14.689918Z","id":"f076e1eb-f3a1-43df-ab63-f0e1a5031fc1","model_set":{"reader":"grok-4.3"},"one_line_summary":"Automatic evaluation tools for literary translations correlate poorly with expert human judgments on creativity and exhibit bias favoring machine-translated texts.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Automatic evaluation metrics and LLM judges correlate poorly with professional translators on creativity in literary texts and bias toward machine outputs.","strongest_claim":"both AEMs and LLM-as-a-judge evaluations correlate poorly with professional evaluations on creativity, with LLM-as-a-judge showing a systematic bias in favour of machine-translated texts and penalising creative and culturally appropriate solutions.","weakest_assumption":"That detailed annotations by experienced professional literary translators constitute an objective and reliable ground truth for measuring creativity and translation quality across genres and modalities."}},"verdict_id":"f076e1eb-f3a1-43df-ab63-f0e1a5031fc1"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:aa2505487c5c730b1eaa2a12b426019912baa18c4bedc2f5c5f88bead724df99","target":"record","created_at":"2026-05-18T02:44:23Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"118075b308bfe2d85f63508597282bc4489f77d3f7d7cc990bd032ea4eb9ba70","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-13T14:30:41Z","title_canon_sha256":"965d6650c7305554093d64c14df732b393b9e6c0db1a64ced164742a9d2176c8"},"schema_version":"1.0","source":{"id":"2605.13596","kind":"arxiv","version":1}},"canonical_sha256":"43f28bf7c5ac2abd3ae7c9d92d38f16cc78ab49d2e0a413c186acacf0aa539b1","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"43f28bf7c5ac2abd3ae7c9d92d38f16cc78ab49d2e0a413c186acacf0aa539b1","first_computed_at":"2026-05-18T02:44:23.010234Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-18T02:44:23.010234Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"WcTCVhBAhlAtGDhKGw2XUNtgx85zy/D3joqxgp1GUzd9hlmYWnkBNOkG0J0iHU9icR0MKPX52kZcQms74ZUrCQ==","signature_status":"signed_v1","signed_at":"2026-05-18T02:44:23.010692Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.13596","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:aa2505487c5c730b1eaa2a12b426019912baa18c4bedc2f5c5f88bead724df99","sha256:21d93f5dbccf0b52d754a57d5e26fa398a251b5b172927fc03e10cdcd35aecfb"],"state_sha256":"b4f4705d6ff6ffc4f7f364e66f51c2431cfb62ad2a346c270a4be440a3d97c72"}