{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2018:5IOQD7T65K3JDCLSTVMJU6GQRP","short_pith_number":"pith:5IOQD7T6","schema_version":"1.0","canonical_sha256":"ea1d01fe7eeab69189729d589a78d08bc5e583f0c2eabaa643bca302900840b9","source":{"kind":"arxiv","id":"1807.02202","version":1},"attestation_state":"computed","paper":{"title":"The price of debiasing automatic metrics in natural language evaluation","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Arun Tejasvi Chaganty, Percy Liang, Stephen Mussman","submitted_at":"2018-07-06T00:11:27Z","abstract_excerpt":"For evaluating generation systems, automatic metrics such as BLEU cost nothing to run but have been shown to correlate poorly with human judgment, leading to systematic bias against certain model improvements. On the other hand, averaging human judgments, the unbiased gold standard, is often too expensive. In this paper, we use control variates to combine automatic metrics with human evaluation to obtain an unbiased estimator with lower cost than human evaluation alone. In practice, however, we obtain only a 7-13% cost reduction on evaluating summarization and open-response question answering "},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1807.02202","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2018-07-06T00:11:27Z","cross_cats_sorted":[],"title_canon_sha256":"16e60b957706fde38ef4c36a3d0448c7cf2039b31ef12524cb0bd95bfa0db5da","abstract_canon_sha256":"9548f2ba6bdab579cff0a4f9ed857043a8acc8e5eec8f1db1cbf8559b7ed4692"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T00:11:22.836691Z","signature_b64":"aJ1FwUBJxQnrib5tPbXIcqMhQOaXWNUueMdlGGF6y81g2CeRTpG5YrzABf8FrDXguw46SA3EaGHNb2oKMT4JAw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"ea1d01fe7eeab69189729d589a78d08bc5e583f0c2eabaa643bca302900840b9","last_reissued_at":"2026-05-18T00:11:22.836195Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T00:11:22.836195Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"The price of debiasing automatic metrics in natural language evaluation","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Arun Tejasvi Chaganty, Percy Liang, Stephen Mussman","submitted_at":"2018-07-06T00:11:27Z","abstract_excerpt":"For evaluating generation systems, automatic metrics such as BLEU cost nothing to run but have been shown to correlate poorly with human judgment, leading to systematic bias against certain model improvements. On the other hand, averaging human judgments, the unbiased gold standard, is often too expensive. In this paper, we use control variates to combine automatic metrics with human evaluation to obtain an unbiased estimator with lower cost than human evaluation alone. In practice, however, we obtain only a 7-13% cost reduction on evaluating summarization and open-response question answering "},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1807.02202","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1807.02202","created_at":"2026-05-18T00:11:22.836268+00:00"},{"alias_kind":"arxiv_version","alias_value":"1807.02202v1","created_at":"2026-05-18T00:11:22.836268+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1807.02202","created_at":"2026-05-18T00:11:22.836268+00:00"},{"alias_kind":"pith_short_12","alias_value":"5IOQD7T65K3J","created_at":"2026-05-18T12:32:08.215937+00:00"},{"alias_kind":"pith_short_16","alias_value":"5IOQD7T65K3JDCLS","created_at":"2026-05-18T12:32:08.215937+00:00"},{"alias_kind":"pith_short_8","alias_value":"5IOQD7T6","created_at":"2026-05-18T12:32:08.215937+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":3,"internal_anchor_count":1,"sample":[{"citing_arxiv_id":"2009.01325","citing_title":"Learning to summarize from human feedback","ref_index":6,"is_internal_anchor":true},{"citing_arxiv_id":"2604.18311","citing_title":"On the Importance and Evaluation of Narrativity in Natural Language AI Explanations","ref_index":30,"is_internal_anchor":false},{"citing_arxiv_id":"2604.17200","citing_title":"Calibrating Model-Based Evaluation Metrics for Summarization","ref_index":112,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/5IOQD7T65K3JDCLSTVMJU6GQRP","json":"https://pith.science/pith/5IOQD7T65K3JDCLSTVMJU6GQRP.json","graph_json":"https://pith.science/api/pith-number/5IOQD7T65K3JDCLSTVMJU6GQRP/graph.json","events_json":"https://pith.science/api/pith-number/5IOQD7T65K3JDCLSTVMJU6GQRP/events.json","paper":"https://pith.science/paper/5IOQD7T6"},"agent_actions":{"view_html":"https://pith.science/pith/5IOQD7T65K3JDCLSTVMJU6GQRP","download_json":"https://pith.science/pith/5IOQD7T65K3JDCLSTVMJU6GQRP.json","view_paper":"https://pith.science/paper/5IOQD7T6","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1807.02202&json=true","fetch_graph":"https://pith.science/api/pith-number/5IOQD7T65K3JDCLSTVMJU6GQRP/graph.json","fetch_events":"https://pith.science/api/pith-number/5IOQD7T65K3JDCLSTVMJU6GQRP/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/5IOQD7T65K3JDCLSTVMJU6GQRP/action/timestamp_anchor","attest_storage":"https://pith.science/pith/5IOQD7T65K3JDCLSTVMJU6GQRP/action/storage_attestation","attest_author":"https://pith.science/pith/5IOQD7T65K3JDCLSTVMJU6GQRP/action/author_attestation","sign_citation":"https://pith.science/pith/5IOQD7T65K3JDCLSTVMJU6GQRP/action/citation_signature","submit_replication":"https://pith.science/pith/5IOQD7T65K3JDCLSTVMJU6GQRP/action/replication_record"}},"created_at":"2026-05-18T00:11:22.836268+00:00","updated_at":"2026-05-18T00:11:22.836268+00:00"}