{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2019:FSFSXM5FYWTQ6NAVWEAB4OU6FD","short_pith_number":"pith:FSFSXM5F","schema_version":"1.0","canonical_sha256":"2c8b2bb3a5c5a70f3415b1001e3a9e28d8bd989219ca96736bca4306351f003a","source":{"kind":"arxiv","id":"1904.02792","version":1},"attestation_state":"computed","paper":{"title":"Unifying Human and Statistical Evaluation for Natural Language Generation","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","stat.ML"],"primary_cat":"cs.CL","authors_text":"Hugh Zhang, Percy Liang, Tatsunori B. Hashimoto","submitted_at":"2019-04-04T21:03:34Z","abstract_excerpt":"How can we measure whether a natural language generation system produces both high quality and diverse outputs? Human evaluation captures quality but not diversity, as it does not catch models that simply plagiarize from the training set. On the other hand, statistical evaluation (i.e., perplexity) captures diversity but not quality, as models that occasionally emit low quality samples would be insufficiently penalized. In this paper, we propose a unified framework which evaluates both diversity and quality, based on the optimal error rate of predicting whether a sentence is human- or machine-"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1904.02792","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2019-04-04T21:03:34Z","cross_cats_sorted":["cs.AI","stat.ML"],"title_canon_sha256":"6b3bf31fb490162a5b85bc68bdfedbeacb40dc415ce98e3c8ca7679cdc7d9018","abstract_canon_sha256":"c773e9c23a5346c8823b69a905b3f5891b1dd7cbfe80ddcb92e8d124c41e8abc"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:49:19.628508Z","signature_b64":"LyGb53uG7qmlpKrqzoRTTzobm4/9IcKnInyqcQoVeiPLXUr3MwrkGViXfX2HIlT1DTDPRgAkaptvzh9U7NwMAQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"2c8b2bb3a5c5a70f3415b1001e3a9e28d8bd989219ca96736bca4306351f003a","last_reissued_at":"2026-05-17T23:49:19.628056Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:49:19.628056Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Unifying Human and Statistical Evaluation for Natural Language Generation","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","stat.ML"],"primary_cat":"cs.CL","authors_text":"Hugh Zhang, Percy Liang, Tatsunori B. Hashimoto","submitted_at":"2019-04-04T21:03:34Z","abstract_excerpt":"How can we measure whether a natural language generation system produces both high quality and diverse outputs? Human evaluation captures quality but not diversity, as it does not catch models that simply plagiarize from the training set. On the other hand, statistical evaluation (i.e., perplexity) captures diversity but not quality, as models that occasionally emit low quality samples would be insufficiently penalized. In this paper, we propose a unified framework which evaluates both diversity and quality, based on the optimal error rate of predicting whether a sentence is human- or machine-"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1904.02792","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1904.02792","created_at":"2026-05-17T23:49:19.628123+00:00"},{"alias_kind":"arxiv_version","alias_value":"1904.02792v1","created_at":"2026-05-17T23:49:19.628123+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1904.02792","created_at":"2026-05-17T23:49:19.628123+00:00"},{"alias_kind":"pith_short_12","alias_value":"FSFSXM5FYWTQ","created_at":"2026-05-18T12:33:15.570797+00:00"},{"alias_kind":"pith_short_16","alias_value":"FSFSXM5FYWTQ6NAV","created_at":"2026-05-18T12:33:15.570797+00:00"},{"alias_kind":"pith_short_8","alias_value":"FSFSXM5F","created_at":"2026-05-18T12:33:15.570797+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":1,"internal_anchor_count":0,"sample":[{"citing_arxiv_id":"2605.05950","citing_title":"Lightweight Stylistic Consistency Profiling: Robust Detection of LLM-Generated Textual Content for Multimedia Moderation","ref_index":30,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/FSFSXM5FYWTQ6NAVWEAB4OU6FD","json":"https://pith.science/pith/FSFSXM5FYWTQ6NAVWEAB4OU6FD.json","graph_json":"https://pith.science/api/pith-number/FSFSXM5FYWTQ6NAVWEAB4OU6FD/graph.json","events_json":"https://pith.science/api/pith-number/FSFSXM5FYWTQ6NAVWEAB4OU6FD/events.json","paper":"https://pith.science/paper/FSFSXM5F"},"agent_actions":{"view_html":"https://pith.science/pith/FSFSXM5FYWTQ6NAVWEAB4OU6FD","download_json":"https://pith.science/pith/FSFSXM5FYWTQ6NAVWEAB4OU6FD.json","view_paper":"https://pith.science/paper/FSFSXM5F","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1904.02792&json=true","fetch_graph":"https://pith.science/api/pith-number/FSFSXM5FYWTQ6NAVWEAB4OU6FD/graph.json","fetch_events":"https://pith.science/api/pith-number/FSFSXM5FYWTQ6NAVWEAB4OU6FD/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/FSFSXM5FYWTQ6NAVWEAB4OU6FD/action/timestamp_anchor","attest_storage":"https://pith.science/pith/FSFSXM5FYWTQ6NAVWEAB4OU6FD/action/storage_attestation","attest_author":"https://pith.science/pith/FSFSXM5FYWTQ6NAVWEAB4OU6FD/action/author_attestation","sign_citation":"https://pith.science/pith/FSFSXM5FYWTQ6NAVWEAB4OU6FD/action/citation_signature","submit_replication":"https://pith.science/pith/FSFSXM5FYWTQ6NAVWEAB4OU6FD/action/replication_record"}},"created_at":"2026-05-17T23:49:19.628123+00:00","updated_at":"2026-05-17T23:49:19.628123+00:00"}