{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:AZZ4Q7OZQ74JQBO3IEU5LOAV5Q","short_pith_number":"pith:AZZ4Q7OZ","schema_version":"1.0","canonical_sha256":"0673c87dd987f89805db4129d5b815ec18d67747892ffc146369af9c669eca7c","source":{"kind":"arxiv","id":"2605.27463","version":1},"attestation_state":"computed","paper":{"title":"When prompt perturbations break your A/B test: A valid statistical test for generative surveying","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","stat.AP"],"primary_cat":"stat.ME","authors_text":"Carey Priebe, Hayden Helm","submitted_at":"2026-05-26T00:35:58Z","abstract_excerpt":"Generative surveying -- where collections of LLM-based personas provide feedback on messages -- has emerged as a cheap and scalable alternative to traditional market research. However, LLMs are sensitive to small variations in prompt design and conclusions drawn from generative surveys may depend on arbitrary phrasing choices. Controlling for this sensitivity requires including semantically equivalent perturbations in the analysis. In this paper, we show that standard hypothesis tests, including the sign test and Wilcoxon signed-rank test, are invalid under a statistical model for generative s"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2605.27463","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"stat.ME","submitted_at":"2026-05-26T00:35:58Z","cross_cats_sorted":["cs.AI","stat.AP"],"title_canon_sha256":"bb1e6183ebc930eeb10621c61ec3140509521c14456dbade8f5078286e8a3c56","abstract_canon_sha256":"ad4ebf925351752435c712e1dd29dc9a72f04d9bd58d454d9f411924852d42fe"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-28T00:05:19.959107Z","signature_b64":"j8FSy2dBRWXZ/x01KST7VGBwA3On1Crli63yuZFz5bPjzkS+gZAzuBChniWXciUDrqENi7VqttSwpikcmh0FAg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"0673c87dd987f89805db4129d5b815ec18d67747892ffc146369af9c669eca7c","last_reissued_at":"2026-05-28T00:05:19.958615Z","signature_status":"signed_v1","first_computed_at":"2026-05-28T00:05:19.958615Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"When prompt perturbations break your A/B test: A valid statistical test for generative surveying","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","stat.AP"],"primary_cat":"stat.ME","authors_text":"Carey Priebe, Hayden Helm","submitted_at":"2026-05-26T00:35:58Z","abstract_excerpt":"Generative surveying -- where collections of LLM-based personas provide feedback on messages -- has emerged as a cheap and scalable alternative to traditional market research. However, LLMs are sensitive to small variations in prompt design and conclusions drawn from generative surveys may depend on arbitrary phrasing choices. Controlling for this sensitivity requires including semantically equivalent perturbations in the analysis. In this paper, we show that standard hypothesis tests, including the sign test and Wilcoxon signed-rank test, are invalid under a statistical model for generative s"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.27463","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.27463/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2605.27463","created_at":"2026-05-28T00:05:19.958689+00:00"},{"alias_kind":"arxiv_version","alias_value":"2605.27463v1","created_at":"2026-05-28T00:05:19.958689+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.27463","created_at":"2026-05-28T00:05:19.958689+00:00"},{"alias_kind":"pith_short_12","alias_value":"AZZ4Q7OZQ74J","created_at":"2026-05-28T00:05:19.958689+00:00"},{"alias_kind":"pith_short_16","alias_value":"AZZ4Q7OZQ74JQBO3","created_at":"2026-05-28T00:05:19.958689+00:00"},{"alias_kind":"pith_short_8","alias_value":"AZZ4Q7OZ","created_at":"2026-05-28T00:05:19.958689+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/AZZ4Q7OZQ74JQBO3IEU5LOAV5Q","json":"https://pith.science/pith/AZZ4Q7OZQ74JQBO3IEU5LOAV5Q.json","graph_json":"https://pith.science/api/pith-number/AZZ4Q7OZQ74JQBO3IEU5LOAV5Q/graph.json","events_json":"https://pith.science/api/pith-number/AZZ4Q7OZQ74JQBO3IEU5LOAV5Q/events.json","paper":"https://pith.science/paper/AZZ4Q7OZ"},"agent_actions":{"view_html":"https://pith.science/pith/AZZ4Q7OZQ74JQBO3IEU5LOAV5Q","download_json":"https://pith.science/pith/AZZ4Q7OZQ74JQBO3IEU5LOAV5Q.json","view_paper":"https://pith.science/paper/AZZ4Q7OZ","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2605.27463&json=true","fetch_graph":"https://pith.science/api/pith-number/AZZ4Q7OZQ74JQBO3IEU5LOAV5Q/graph.json","fetch_events":"https://pith.science/api/pith-number/AZZ4Q7OZQ74JQBO3IEU5LOAV5Q/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/AZZ4Q7OZQ74JQBO3IEU5LOAV5Q/action/timestamp_anchor","attest_storage":"https://pith.science/pith/AZZ4Q7OZQ74JQBO3IEU5LOAV5Q/action/storage_attestation","attest_author":"https://pith.science/pith/AZZ4Q7OZQ74JQBO3IEU5LOAV5Q/action/author_attestation","sign_citation":"https://pith.science/pith/AZZ4Q7OZQ74JQBO3IEU5LOAV5Q/action/citation_signature","submit_replication":"https://pith.science/pith/AZZ4Q7OZQ74JQBO3IEU5LOAV5Q/action/replication_record"}},"created_at":"2026-05-28T00:05:19.958689+00:00","updated_at":"2026-05-28T00:05:19.958689+00:00"}