{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2024:VHKYNHJDNZYHV4EDWGX7NJXUY6","short_pith_number":"pith:VHKYNHJD","schema_version":"1.0","canonical_sha256":"a9d5869d236e707af083b1aff6a6f4c7938f9256a000642876f635d7c7155fdd","source":{"kind":"arxiv","id":"2407.02039","version":3},"attestation_state":"computed","paper":{"title":"Prompt Stability Scoring for Text Annotation with Large Language Models","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Christopher Barrie, Elli Palaiologou, Petter T\\\"ornberg","submitted_at":"2024-07-02T08:11:18Z","abstract_excerpt":"Researchers are increasingly using language models (LMs) for text annotation. These approaches rely only on a prompt telling the model to return a given output according to a set of instructions. The reproducibility of LM outputs may nonetheless be vulnerable to small changes in the prompt design. This calls into question the replicability of classification routines. To tackle this problem, researchers have typically tested a variety of semantically similar prompts to determine what we call ``prompt stability.\" These approaches remain ad-hoc and task specific. In this article, we propose a gen"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2407.02039","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2024-07-02T08:11:18Z","cross_cats_sorted":[],"title_canon_sha256":"320908346f3674c7f772cf3e9352e02e49592b526817aee5b5eae359ef4b7a3e","abstract_canon_sha256":"16072593368e70a394f421db7e898b0d19d34cbb0348d5571af5ae84c3bf4ae2"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T00:00:15.756644Z","signature_b64":"QJrxcal9Amj7/Ag1ADXLHNGHgLSKCW3W/nIZpZvDOt9md4rQ8NQMQSZt/X+ZsN2ziZ2mFZEnKCwm3lOlMxTkDQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"a9d5869d236e707af083b1aff6a6f4c7938f9256a000642876f635d7c7155fdd","last_reissued_at":"2026-05-20T00:00:15.755906Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T00:00:15.755906Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Prompt Stability Scoring for Text Annotation with Large Language Models","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Christopher Barrie, Elli Palaiologou, Petter T\\\"ornberg","submitted_at":"2024-07-02T08:11:18Z","abstract_excerpt":"Researchers are increasingly using language models (LMs) for text annotation. These approaches rely only on a prompt telling the model to return a given output according to a set of instructions. The reproducibility of LM outputs may nonetheless be vulnerable to small changes in the prompt design. This calls into question the replicability of classification routines. To tackle this problem, researchers have typically tested a variety of semantically similar prompts to determine what we call ``prompt stability.\" These approaches remain ad-hoc and task specific. In this article, we propose a gen"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2407.02039","kind":"arxiv","version":3},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2407.02039/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2407.02039","created_at":"2026-05-20T00:00:15.756008+00:00"},{"alias_kind":"arxiv_version","alias_value":"2407.02039v3","created_at":"2026-05-20T00:00:15.756008+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2407.02039","created_at":"2026-05-20T00:00:15.756008+00:00"},{"alias_kind":"pith_short_12","alias_value":"VHKYNHJDNZYH","created_at":"2026-05-20T00:00:15.756008+00:00"},{"alias_kind":"pith_short_16","alias_value":"VHKYNHJDNZYHV4ED","created_at":"2026-05-20T00:00:15.756008+00:00"},{"alias_kind":"pith_short_8","alias_value":"VHKYNHJD","created_at":"2026-05-20T00:00:15.756008+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":1,"internal_anchor_count":1,"sample":[{"citing_arxiv_id":"2605.13412","citing_title":"LLMs as annotators of credibility assessment in Danish asylum decisions: evaluating classification performance and errors beyond aggregated metrics","ref_index":11,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/VHKYNHJDNZYHV4EDWGX7NJXUY6","json":"https://pith.science/pith/VHKYNHJDNZYHV4EDWGX7NJXUY6.json","graph_json":"https://pith.science/api/pith-number/VHKYNHJDNZYHV4EDWGX7NJXUY6/graph.json","events_json":"https://pith.science/api/pith-number/VHKYNHJDNZYHV4EDWGX7NJXUY6/events.json","paper":"https://pith.science/paper/VHKYNHJD"},"agent_actions":{"view_html":"https://pith.science/pith/VHKYNHJDNZYHV4EDWGX7NJXUY6","download_json":"https://pith.science/pith/VHKYNHJDNZYHV4EDWGX7NJXUY6.json","view_paper":"https://pith.science/paper/VHKYNHJD","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2407.02039&json=true","fetch_graph":"https://pith.science/api/pith-number/VHKYNHJDNZYHV4EDWGX7NJXUY6/graph.json","fetch_events":"https://pith.science/api/pith-number/VHKYNHJDNZYHV4EDWGX7NJXUY6/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/VHKYNHJDNZYHV4EDWGX7NJXUY6/action/timestamp_anchor","attest_storage":"https://pith.science/pith/VHKYNHJDNZYHV4EDWGX7NJXUY6/action/storage_attestation","attest_author":"https://pith.science/pith/VHKYNHJDNZYHV4EDWGX7NJXUY6/action/author_attestation","sign_citation":"https://pith.science/pith/VHKYNHJDNZYHV4EDWGX7NJXUY6/action/citation_signature","submit_replication":"https://pith.science/pith/VHKYNHJDNZYHV4EDWGX7NJXUY6/action/replication_record"}},"created_at":"2026-05-20T00:00:15.756008+00:00","updated_at":"2026-05-20T00:00:15.756008+00:00"}