{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2023:6UG7QNNZDYOU574GK653U7DRKD","short_pith_number":"pith:6UG7QNNZ","canonical_record":{"source":{"id":"2302.04166","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2023-02-08T16:17:29Z","cross_cats_sorted":[],"title_canon_sha256":"fbe6d4804eed2dc343a3d0d23df63ede9cf07092a9013bf8f85857fc3b06ba7f","abstract_canon_sha256":"5a087b62a0a246edf049a858a2bb51dfda02d2af9147f8f2f29c0ca602acb3ec"},"schema_version":"1.0"},"canonical_sha256":"f50df835b91e1d4eff8657bbba7c7150d4fba63d1d8e0aa443e3eb3899ff1c48","source":{"kind":"arxiv","id":"2302.04166","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2302.04166","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"arxiv_version","alias_value":"2302.04166v2","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2302.04166","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"pith_short_12","alias_value":"6UG7QNNZDYOU","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_16","alias_value":"6UG7QNNZDYOU574G","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_8","alias_value":"6UG7QNNZ","created_at":"2026-05-18T12:33:33Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2023:6UG7QNNZDYOU574GK653U7DRKD","target":"record","payload":{"canonical_record":{"source":{"id":"2302.04166","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2023-02-08T16:17:29Z","cross_cats_sorted":[],"title_canon_sha256":"fbe6d4804eed2dc343a3d0d23df63ede9cf07092a9013bf8f85857fc3b06ba7f","abstract_canon_sha256":"5a087b62a0a246edf049a858a2bb51dfda02d2af9147f8f2f29c0ca602acb3ec"},"schema_version":"1.0"},"canonical_sha256":"f50df835b91e1d4eff8657bbba7c7150d4fba63d1d8e0aa443e3eb3899ff1c48","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:13.552079Z","signature_b64":"9BgGoEtd6bD0zuhrG+Khx1u/vxlijEdwHbu5H+9fv4OJCBTvbzHa2BBcqBjp3YMjg0VrpMDPqMrYyA43nT2JDw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"f50df835b91e1d4eff8657bbba7c7150d4fba63d1d8e0aa443e3eb3899ff1c48","last_reissued_at":"2026-05-17T23:38:13.551525Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:13.551525Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2302.04166","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:13Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"x5h0u0ryuqOmK9HjJK1JOKO619UT9cfwbXkAU3/U1LqZwPMIINeX6LGMIhz1Yk4Kwfa2mbJf0axjzbYZIYTeDQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-24T06:23:13.724033Z"},"content_sha256":"eff30dad8df717a68ae1eef05260692c410b537e9b34493dbf202864d7df9845","schema_version":"1.0","event_id":"sha256:eff30dad8df717a68ae1eef05260692c410b537e9b34493dbf202864d7df9845"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2023:6UG7QNNZDYOU574GK653U7DRKD","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"GPTScore: Evaluate as You Desire","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"GPTScore uses zero-shot prompting of generative models ranging from 80M to 175B parameters to evaluate text according to arbitrary natural language criteria, tested on 4 tasks, 22 aspects, and 37 datasets.","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Jinlan Fu, Pengfei Liu, See-kiong Ng, Zhengbao Jiang","submitted_at":"2023-02-08T16:17:29Z","abstract_excerpt":"Generative Artificial Intelligence (AI) has enabled the development of sophisticated models that are capable of producing high-caliber text, images, and other outputs through the utilization of large pre-trained models. Nevertheless, assessing the quality of the generation is an even more arduous task than the generation itself, and this issue has not been given adequate consideration recently. This paper proposes a novel evaluation framework, GPTScore, which utilizes the emergent abilities (e.g., zero-shot instruction) of generative pre-trained models to score generated texts. There are 19 pr"},"claims":{"count":3,"items":[{"kind":"strongest_claim","text":"Experimental results on four text generation tasks, 22 evaluation aspects, and corresponding 37 datasets demonstrate that this approach can effectively allow us to achieve what one desires to evaluate for texts simply by natural language instructions.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the emergent zero-shot instruction-following abilities of the tested pre-trained models can produce scores that meaningfully reflect the desired evaluation criteria without task-specific fine-tuning or annotated samples.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"GPTScore uses zero-shot prompting of generative models ranging from 80M to 175B parameters to evaluate text according to arbitrary natural language criteria, tested on 4 tasks, 22 aspects, and 37 datasets.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"}],"snapshot_sha256":"bde13bac3bc5ae4bf8564f15f92524aa60810022602e12205bc3b64ce58f54d8"},"source":{"id":"2302.04166","kind":"arxiv","version":2},"verdict":{"id":"a3a32912-9185-4ea9-a20b-b29fb853ecfc","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-17T17:06:06.842651Z","strongest_claim":"Experimental results on four text generation tasks, 22 evaluation aspects, and corresponding 37 datasets demonstrate that this approach can effectively allow us to achieve what one desires to evaluate for texts simply by natural language instructions.","one_line_summary":"GPTScore uses zero-shot prompting of generative models ranging from 80M to 175B parameters to evaluate text according to arbitrary natural language criteria, tested on 4 tasks, 22 aspects, and 37 datasets.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the emergent zero-shot instruction-following abilities of the tested pre-trained models can produce scores that meaningfully reflect the desired evaluation criteria without task-specific fine-tuning or annotated samples.","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":2,"snapshot_sha256":"651a82ecb0c5242a58809d718f6641ff889812b0957a4fb3adc1a49a31950172"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"a3a32912-9185-4ea9-a20b-b29fb853ecfc"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:13Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"/xiA0qkKPGvwEx4DKgcD+aC5970yGhU9JmcrIxD4vTK32snYC1Bi+yx+8O8GsFMOCGUVGMr6ccyssiIf1SSIBg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-24T06:23:13.724629Z"},"content_sha256":"402417898b2459ee520a8f067b982a44896b1494b3a5163deb98eb4211b27894","schema_version":"1.0","event_id":"sha256:402417898b2459ee520a8f067b982a44896b1494b3a5163deb98eb4211b27894"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/6UG7QNNZDYOU574GK653U7DRKD/bundle.json","state_url":"https://pith.science/pith/6UG7QNNZDYOU574GK653U7DRKD/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/6UG7QNNZDYOU574GK653U7DRKD/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-24T06:23:13Z","links":{"resolver":"https://pith.science/pith/6UG7QNNZDYOU574GK653U7DRKD","bundle":"https://pith.science/pith/6UG7QNNZDYOU574GK653U7DRKD/bundle.json","state":"https://pith.science/pith/6UG7QNNZDYOU574GK653U7DRKD/state.json","well_known_bundle":"https://pith.science/.well-known/pith/6UG7QNNZDYOU574GK653U7DRKD/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2023:6UG7QNNZDYOU574GK653U7DRKD","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"5a087b62a0a246edf049a858a2bb51dfda02d2af9147f8f2f29c0ca602acb3ec","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2023-02-08T16:17:29Z","title_canon_sha256":"fbe6d4804eed2dc343a3d0d23df63ede9cf07092a9013bf8f85857fc3b06ba7f"},"schema_version":"1.0","source":{"id":"2302.04166","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2302.04166","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"arxiv_version","alias_value":"2302.04166v2","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2302.04166","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"pith_short_12","alias_value":"6UG7QNNZDYOU","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_16","alias_value":"6UG7QNNZDYOU574G","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_8","alias_value":"6UG7QNNZ","created_at":"2026-05-18T12:33:33Z"}],"graph_snapshots":[{"event_id":"sha256:402417898b2459ee520a8f067b982a44896b1494b3a5163deb98eb4211b27894","target":"graph","created_at":"2026-05-17T23:38:13Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":3,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Experimental results on four text generation tasks, 22 evaluation aspects, and corresponding 37 datasets demonstrate that this approach can effectively allow us to achieve what one desires to evaluate for texts simply by natural language instructions."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the emergent zero-shot instruction-following abilities of the tested pre-trained models can produce scores that meaningfully reflect the desired evaluation criteria without task-specific fine-tuning or annotated samples."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"GPTScore uses zero-shot prompting of generative models ranging from 80M to 175B parameters to evaluate text according to arbitrary natural language criteria, tested on 4 tasks, 22 aspects, and 37 datasets."}],"snapshot_sha256":"bde13bac3bc5ae4bf8564f15f92524aa60810022602e12205bc3b64ce58f54d8"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"651a82ecb0c5242a58809d718f6641ff889812b0957a4fb3adc1a49a31950172"},"paper":{"abstract_excerpt":"Generative Artificial Intelligence (AI) has enabled the development of sophisticated models that are capable of producing high-caliber text, images, and other outputs through the utilization of large pre-trained models. Nevertheless, assessing the quality of the generation is an even more arduous task than the generation itself, and this issue has not been given adequate consideration recently. This paper proposes a novel evaluation framework, GPTScore, which utilizes the emergent abilities (e.g., zero-shot instruction) of generative pre-trained models to score generated texts. There are 19 pr","authors_text":"Jinlan Fu, Pengfei Liu, See-kiong Ng, Zhengbao Jiang","cross_cats":[],"headline":"GPTScore uses zero-shot prompting of generative models ranging from 80M to 175B parameters to evaluate text according to arbitrary natural language criteria, tested on 4 tasks, 22 aspects, and 37 datasets.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2023-02-08T16:17:29Z","title":"GPTScore: Evaluate as You Desire"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2302.04166","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-17T17:06:06.842651Z","id":"a3a32912-9185-4ea9-a20b-b29fb853ecfc","model_set":{"reader":"grok-4.3"},"one_line_summary":"GPTScore uses zero-shot prompting of generative models ranging from 80M to 175B parameters to evaluate text according to arbitrary natural language criteria, tested on 4 tasks, 22 aspects, and 37 datasets.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"","strongest_claim":"Experimental results on four text generation tasks, 22 evaluation aspects, and corresponding 37 datasets demonstrate that this approach can effectively allow us to achieve what one desires to evaluate for texts simply by natural language instructions.","weakest_assumption":"That the emergent zero-shot instruction-following abilities of the tested pre-trained models can produce scores that meaningfully reflect the desired evaluation criteria without task-specific fine-tuning or annotated samples."}},"verdict_id":"a3a32912-9185-4ea9-a20b-b29fb853ecfc"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:eff30dad8df717a68ae1eef05260692c410b537e9b34493dbf202864d7df9845","target":"record","created_at":"2026-05-17T23:38:13Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"5a087b62a0a246edf049a858a2bb51dfda02d2af9147f8f2f29c0ca602acb3ec","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2023-02-08T16:17:29Z","title_canon_sha256":"fbe6d4804eed2dc343a3d0d23df63ede9cf07092a9013bf8f85857fc3b06ba7f"},"schema_version":"1.0","source":{"id":"2302.04166","kind":"arxiv","version":2}},"canonical_sha256":"f50df835b91e1d4eff8657bbba7c7150d4fba63d1d8e0aa443e3eb3899ff1c48","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"f50df835b91e1d4eff8657bbba7c7150d4fba63d1d8e0aa443e3eb3899ff1c48","first_computed_at":"2026-05-17T23:38:13.551525Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:13.551525Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"9BgGoEtd6bD0zuhrG+Khx1u/vxlijEdwHbu5H+9fv4OJCBTvbzHa2BBcqBjp3YMjg0VrpMDPqMrYyA43nT2JDw==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:13.552079Z","signed_message":"canonical_sha256_bytes"},"source_id":"2302.04166","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:eff30dad8df717a68ae1eef05260692c410b537e9b34493dbf202864d7df9845","sha256:402417898b2459ee520a8f067b982a44896b1494b3a5163deb98eb4211b27894"],"state_sha256":"8f0566f333ef9ea28cc8c0d29c15c4cd1651fae91ab7a2b393f4bd278ee5327c"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"KjnmTXQ/PwAmVBpSxWuzA4k+HMDXpdgVPzlNIC+XryufjxMCctSsI1WbyChz4rGS27/9xGc38uZM9xF6liR5AA==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-24T06:23:13.728142Z","bundle_sha256":"40451d4f715e904bc0142b49df5690129eb98de34a5128b39799144fc972f77b"}}