{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2016:JJPYR32USAIOBUGL66S6VFIJSU","short_pith_number":"pith:JJPYR32U","canonical_record":{"source":{"id":"1603.08023","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2016-03-25T20:32:21Z","cross_cats_sorted":["cs.AI","cs.LG","cs.NE"],"title_canon_sha256":"37ad30cda3b3b1dccfdff4ab356e463bdc643ad1094f74fe2700c4ae4382844d","abstract_canon_sha256":"e31b998345d83a3693991c8ea1df962ca26a5ab1f6aee8cbfb693a0e04f62487"},"schema_version":"1.0"},"canonical_sha256":"4a5f88ef549010e0d0cbf7a5ea9509952cce143cec1835dae636157880446629","source":{"kind":"arxiv","id":"1603.08023","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"1603.08023","created_at":"2026-05-18T00:53:34Z"},{"alias_kind":"arxiv_version","alias_value":"1603.08023v2","created_at":"2026-05-18T00:53:34Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1603.08023","created_at":"2026-05-18T00:53:34Z"},{"alias_kind":"pith_short_12","alias_value":"JJPYR32USAIO","created_at":"2026-05-18T12:30:25Z"},{"alias_kind":"pith_short_16","alias_value":"JJPYR32USAIOBUGL","created_at":"2026-05-18T12:30:25Z"},{"alias_kind":"pith_short_8","alias_value":"JJPYR32U","created_at":"2026-05-18T12:30:25Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2016:JJPYR32USAIOBUGL66S6VFIJSU","target":"record","payload":{"canonical_record":{"source":{"id":"1603.08023","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2016-03-25T20:32:21Z","cross_cats_sorted":["cs.AI","cs.LG","cs.NE"],"title_canon_sha256":"37ad30cda3b3b1dccfdff4ab356e463bdc643ad1094f74fe2700c4ae4382844d","abstract_canon_sha256":"e31b998345d83a3693991c8ea1df962ca26a5ab1f6aee8cbfb693a0e04f62487"},"schema_version":"1.0"},"canonical_sha256":"4a5f88ef549010e0d0cbf7a5ea9509952cce143cec1835dae636157880446629","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T00:53:34.438236Z","signature_b64":"IHr2TPUCw/tzwtHxC6rbinb14MwKj5Rg53D3lfYKI14RqLU4fhvnvppqXbbXFjZxFW0Q/7EEdsCt8P32asfSAw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"4a5f88ef549010e0d0cbf7a5ea9509952cce143cec1835dae636157880446629","last_reissued_at":"2026-05-18T00:53:34.437773Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T00:53:34.437773Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"1603.08023","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T00:53:34Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"yUKUmJGDFYTUTJ4Ixxnf0imo6Xhx7NxOzc/vlmHPFgYZK/AOhl8bkwZHAuFb2ow8gKut4FfIU2W+rK5f1QIRDA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-28T01:26:50.364878Z"},"content_sha256":"005eb7eb2c06e0ea827ea7ee9328273f2991f252cbd1d6435f36ae5c7db48962","schema_version":"1.0","event_id":"sha256:005eb7eb2c06e0ea827ea7ee9328273f2991f252cbd1d6435f36ae5c7db48962"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2016:JJPYR32USAIOBUGL66S6VFIJSU","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"How NOT To Evaluate Your Dialogue System: An Empirical Study of Unsupervised Evaluation Metrics for Dialogue Response Generation","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","cs.LG","cs.NE"],"primary_cat":"cs.CL","authors_text":"Chia-Wei Liu, Iulian V. Serban, Joelle Pineau, Laurent Charlin, Michael Noseworthy, Ryan Lowe","submitted_at":"2016-03-25T20:32:21Z","abstract_excerpt":"We investigate evaluation metrics for dialogue response generation systems where supervised labels, such as task completion, are not available. Recent works in response generation have adopted metrics from machine translation to compare a model's generated response to a single target response. We show that these metrics correlate very weakly with human judgements in the non-technical Twitter domain, and not at all in the technical Ubuntu domain. We provide quantitative and qualitative results highlighting specific weaknesses in existing metrics, and provide recommendations for future developme"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1603.08023","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T00:53:34Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"J8sUQT2Ihq8CfjnMFgZiPFDal3OQVtn64lz31e/s3oDoJ3AxypJyqy6DHOk9AMuJGsno6Hcr/b7kF/wLF96EAQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-28T01:26:50.365261Z"},"content_sha256":"25d8bfbe94b5601ecf9b8a86c28d4f69d53b3b34176682e2027b5358c3ebc0db","schema_version":"1.0","event_id":"sha256:25d8bfbe94b5601ecf9b8a86c28d4f69d53b3b34176682e2027b5358c3ebc0db"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/JJPYR32USAIOBUGL66S6VFIJSU/bundle.json","state_url":"https://pith.science/pith/JJPYR32USAIOBUGL66S6VFIJSU/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/JJPYR32USAIOBUGL66S6VFIJSU/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-28T01:26:50Z","links":{"resolver":"https://pith.science/pith/JJPYR32USAIOBUGL66S6VFIJSU","bundle":"https://pith.science/pith/JJPYR32USAIOBUGL66S6VFIJSU/bundle.json","state":"https://pith.science/pith/JJPYR32USAIOBUGL66S6VFIJSU/state.json","well_known_bundle":"https://pith.science/.well-known/pith/JJPYR32USAIOBUGL66S6VFIJSU/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2016:JJPYR32USAIOBUGL66S6VFIJSU","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"e31b998345d83a3693991c8ea1df962ca26a5ab1f6aee8cbfb693a0e04f62487","cross_cats_sorted":["cs.AI","cs.LG","cs.NE"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2016-03-25T20:32:21Z","title_canon_sha256":"37ad30cda3b3b1dccfdff4ab356e463bdc643ad1094f74fe2700c4ae4382844d"},"schema_version":"1.0","source":{"id":"1603.08023","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"1603.08023","created_at":"2026-05-18T00:53:34Z"},{"alias_kind":"arxiv_version","alias_value":"1603.08023v2","created_at":"2026-05-18T00:53:34Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1603.08023","created_at":"2026-05-18T00:53:34Z"},{"alias_kind":"pith_short_12","alias_value":"JJPYR32USAIO","created_at":"2026-05-18T12:30:25Z"},{"alias_kind":"pith_short_16","alias_value":"JJPYR32USAIOBUGL","created_at":"2026-05-18T12:30:25Z"},{"alias_kind":"pith_short_8","alias_value":"JJPYR32U","created_at":"2026-05-18T12:30:25Z"}],"graph_snapshots":[{"event_id":"sha256:25d8bfbe94b5601ecf9b8a86c28d4f69d53b3b34176682e2027b5358c3ebc0db","target":"graph","created_at":"2026-05-18T00:53:34Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"We investigate evaluation metrics for dialogue response generation systems where supervised labels, such as task completion, are not available. Recent works in response generation have adopted metrics from machine translation to compare a model's generated response to a single target response. We show that these metrics correlate very weakly with human judgements in the non-technical Twitter domain, and not at all in the technical Ubuntu domain. We provide quantitative and qualitative results highlighting specific weaknesses in existing metrics, and provide recommendations for future developme","authors_text":"Chia-Wei Liu, Iulian V. Serban, Joelle Pineau, Laurent Charlin, Michael Noseworthy, Ryan Lowe","cross_cats":["cs.AI","cs.LG","cs.NE"],"headline":"","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2016-03-25T20:32:21Z","title":"How NOT To Evaluate Your Dialogue System: An Empirical Study of Unsupervised Evaluation Metrics for Dialogue Response Generation"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1603.08023","kind":"arxiv","version":2},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:005eb7eb2c06e0ea827ea7ee9328273f2991f252cbd1d6435f36ae5c7db48962","target":"record","created_at":"2026-05-18T00:53:34Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"e31b998345d83a3693991c8ea1df962ca26a5ab1f6aee8cbfb693a0e04f62487","cross_cats_sorted":["cs.AI","cs.LG","cs.NE"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2016-03-25T20:32:21Z","title_canon_sha256":"37ad30cda3b3b1dccfdff4ab356e463bdc643ad1094f74fe2700c4ae4382844d"},"schema_version":"1.0","source":{"id":"1603.08023","kind":"arxiv","version":2}},"canonical_sha256":"4a5f88ef549010e0d0cbf7a5ea9509952cce143cec1835dae636157880446629","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"4a5f88ef549010e0d0cbf7a5ea9509952cce143cec1835dae636157880446629","first_computed_at":"2026-05-18T00:53:34.437773Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-18T00:53:34.437773Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"IHr2TPUCw/tzwtHxC6rbinb14MwKj5Rg53D3lfYKI14RqLU4fhvnvppqXbbXFjZxFW0Q/7EEdsCt8P32asfSAw==","signature_status":"signed_v1","signed_at":"2026-05-18T00:53:34.438236Z","signed_message":"canonical_sha256_bytes"},"source_id":"1603.08023","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:005eb7eb2c06e0ea827ea7ee9328273f2991f252cbd1d6435f36ae5c7db48962","sha256:25d8bfbe94b5601ecf9b8a86c28d4f69d53b3b34176682e2027b5358c3ebc0db"],"state_sha256":"61f04d822d2834766d2011ccdfc4c06f28f913e56a107960d713acba8a1ebd02"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"ohkscjFeknqXNkeArl6QQjosOKjyUeaV4W+A1RBOpW/iTXRBRsQWg5T12KbOUtJcl4/kok+f1DCCROQc3twqDQ==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-28T01:26:50.367271Z","bundle_sha256":"f8454d2fc94c6ab48c1eca3c85e533daeb0e4bd31c97cf9ec144c7502858f02b"}}