{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2022:LO2EULQNDEZ4R33CWWGIP7XQNV","short_pith_number":"pith:LO2EULQN","canonical_record":{"source":{"id":"2202.12837","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2022-02-25T17:25:19Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"274847321a77ccf35bc2559050cd585919998fa6fb270700a1bc1d6087907a26","abstract_canon_sha256":"e4913c3c1646b32d0c2b76710aa070a3c523fa645f2f2b1702d4594d1213ece1"},"schema_version":"1.0"},"canonical_sha256":"5bb44a2e0d1933c8ef62b58c87fef06d4b52c2889253a1717819c66279b87b41","source":{"kind":"arxiv","id":"2202.12837","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2202.12837","created_at":"2026-05-17T23:38:52Z"},{"alias_kind":"arxiv_version","alias_value":"2202.12837v2","created_at":"2026-05-17T23:38:52Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2202.12837","created_at":"2026-05-17T23:38:52Z"},{"alias_kind":"pith_short_12","alias_value":"LO2EULQNDEZ4","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_16","alias_value":"LO2EULQNDEZ4R33C","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_8","alias_value":"LO2EULQN","created_at":"2026-05-18T12:33:33Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2022:LO2EULQNDEZ4R33CWWGIP7XQNV","target":"record","payload":{"canonical_record":{"source":{"id":"2202.12837","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2022-02-25T17:25:19Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"274847321a77ccf35bc2559050cd585919998fa6fb270700a1bc1d6087907a26","abstract_canon_sha256":"e4913c3c1646b32d0c2b76710aa070a3c523fa645f2f2b1702d4594d1213ece1"},"schema_version":"1.0"},"canonical_sha256":"5bb44a2e0d1933c8ef62b58c87fef06d4b52c2889253a1717819c66279b87b41","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:52.846091Z","signature_b64":"y8KSO5K+4JV91FEaGp5J8yR+oD75stLRtJJsmj0BOzoYdLd9osaposp195/wGuO22EgMALl/V8AX6fommT1KAA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"5bb44a2e0d1933c8ef62b58c87fef06d4b52c2889253a1717819c66279b87b41","last_reissued_at":"2026-05-17T23:38:52.845449Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:52.845449Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2202.12837","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:52Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"LXKKlV5oiqxH+cx3B9FTqsOX5goQUFtoDQWqlJP1vOPaWDjWGPW5DzUW1O0BkFSYhfzI4Pf0z3Wyuln+aipMBw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-30T12:07:29.057673Z"},"content_sha256":"7763be096576eb86b90619146dcdac2c828341bad4930d8a4416cad0df1c819b","schema_version":"1.0","event_id":"sha256:7763be096576eb86b90619146dcdac2c828341bad4930d8a4416cad0df1c819b"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2022:LO2EULQNDEZ4R33CWWGIP7XQNV","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Rethinking the Role of Demonstrations: What Makes In-Context Learning Work?","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Randomly replacing labels in in-context demonstrations barely hurts performance on classification and multiple-choice tasks across many models.","cross_cats":["cs.AI"],"primary_cat":"cs.CL","authors_text":"Ari Holtzman, Hannaneh Hajishirzi, Luke Zettlemoyer, Mikel Artetxe, Mike Lewis, Sewon Min, Xinxi Lyu","submitted_at":"2022-02-25T17:25:19Z","abstract_excerpt":"Large language models (LMs) are able to in-context learn -- perform a new task via inference alone by conditioning on a few input-label pairs (demonstrations) and making predictions for new inputs. However, there has been little understanding of how the model learns and which aspects of the demonstrations contribute to end task performance. In this paper, we show that ground truth demonstrations are in fact not required -- randomly replacing labels in the demonstrations barely hurts performance on a range of classification and multi-choce tasks, consistently over 12 different models including "},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"ground truth demonstrations are in fact not required -- randomly replacing labels in the demonstrations barely hurts performance on a range of classification and multi-choice tasks, consistently over 12 different models including GPT-3","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That randomly replacing labels does not introduce unintended statistical cues or that the chosen classification and multiple-choice tasks are representative of broader in-context learning behavior.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Randomly replacing labels in in-context demonstrations barely hurts performance, showing that label space, input distribution, and sequence format drive in-context learning more than ground-truth labels.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Randomly replacing labels in in-context demonstrations barely hurts performance on classification and multiple-choice tasks across many models.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"61d4dfbf8396fe4eeb64d839bf014ddd2b62c460dba2dcabf580a436aeb1b12e"},"source":{"id":"2202.12837","kind":"arxiv","version":2},"verdict":{"id":"d0753e83-3c25-4144-8164-18aa9ee7932b","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T09:47:47.546394Z","strongest_claim":"ground truth demonstrations are in fact not required -- randomly replacing labels in the demonstrations barely hurts performance on a range of classification and multi-choice tasks, consistently over 12 different models including GPT-3","one_line_summary":"Randomly replacing labels in in-context demonstrations barely hurts performance, showing that label space, input distribution, and sequence format drive in-context learning more than ground-truth labels.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That randomly replacing labels does not introduce unintended statistical cues or that the chosen classification and multiple-choice tasks are representative of broader in-context learning behavior.","pith_extraction_headline":"Randomly replacing labels in in-context demonstrations barely hurts performance on classification and multiple-choice tasks across many models."},"references":{"count":237,"sample":[{"doi":"","year":2011,"title":"Robust Disambiguation of Named Entities in Text","work_id":"3474f56b-b2f4-4a79-969b-ce696ce45a7a","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2019,"title":"CODAH : An Adversarially-Authored Question Answering Dataset for Common Sense","work_id":"03c831cd-ef72-48f3-9e66-66368b214d6c","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2005,"title":"Dolan, William B. and Brockett, Chris. Automatically Constructing a Corpus of Sentential Paraphrases. Proceedings of the Third International Workshop on Paraphrasing ( IWP 2005). 2005","work_id":"30b6b875-c3df-419a-8022-45756fa18613","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"DBpedia - A large-scale, multilingual knowledge base extracted from Wikipedia ,author=. Semantic Web ,year=","work_id":"742ea21d-e273-4d5b-af41-a056bd9f6844","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Abductive Commonsense Reasoning ,author=. ICLR ,year=","work_id":"696a87a7-3cc0-4345-b7e8-0a4048be94f4","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":237,"snapshot_sha256":"e0606b4aed025a4589039248ad69c666c1f7300df7f660b2b3fbf175f80bc8f5","internal_anchors":6},"formal_canon":{"evidence_count":2,"snapshot_sha256":"1a4bf141a32e3a1d3257f85429211d3c3a348e0ddeb32f97b209756120bf00e4"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"d0753e83-3c25-4144-8164-18aa9ee7932b"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:52Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"e7TWw7OkLArZWV0ztHdJQUcFB/5buUd5JvpqdtQHg8eItKYXH5S1RtF1OPVMkLMA1ItlVTp5FtJ4U1z5XTl/DQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-30T12:07:29.058207Z"},"content_sha256":"373c0086c469ad2d12df4b15ab749c70f835203034dc9262dabd838a767c2611","schema_version":"1.0","event_id":"sha256:373c0086c469ad2d12df4b15ab749c70f835203034dc9262dabd838a767c2611"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/LO2EULQNDEZ4R33CWWGIP7XQNV/bundle.json","state_url":"https://pith.science/pith/LO2EULQNDEZ4R33CWWGIP7XQNV/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/LO2EULQNDEZ4R33CWWGIP7XQNV/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-30T12:07:29Z","links":{"resolver":"https://pith.science/pith/LO2EULQNDEZ4R33CWWGIP7XQNV","bundle":"https://pith.science/pith/LO2EULQNDEZ4R33CWWGIP7XQNV/bundle.json","state":"https://pith.science/pith/LO2EULQNDEZ4R33CWWGIP7XQNV/state.json","well_known_bundle":"https://pith.science/.well-known/pith/LO2EULQNDEZ4R33CWWGIP7XQNV/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2022:LO2EULQNDEZ4R33CWWGIP7XQNV","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"e4913c3c1646b32d0c2b76710aa070a3c523fa645f2f2b1702d4594d1213ece1","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2022-02-25T17:25:19Z","title_canon_sha256":"274847321a77ccf35bc2559050cd585919998fa6fb270700a1bc1d6087907a26"},"schema_version":"1.0","source":{"id":"2202.12837","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2202.12837","created_at":"2026-05-17T23:38:52Z"},{"alias_kind":"arxiv_version","alias_value":"2202.12837v2","created_at":"2026-05-17T23:38:52Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2202.12837","created_at":"2026-05-17T23:38:52Z"},{"alias_kind":"pith_short_12","alias_value":"LO2EULQNDEZ4","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_16","alias_value":"LO2EULQNDEZ4R33C","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_8","alias_value":"LO2EULQN","created_at":"2026-05-18T12:33:33Z"}],"graph_snapshots":[{"event_id":"sha256:373c0086c469ad2d12df4b15ab749c70f835203034dc9262dabd838a767c2611","target":"graph","created_at":"2026-05-17T23:38:52Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"ground truth demonstrations are in fact not required -- randomly replacing labels in the demonstrations barely hurts performance on a range of classification and multi-choice tasks, consistently over 12 different models including GPT-3"},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That randomly replacing labels does not introduce unintended statistical cues or that the chosen classification and multiple-choice tasks are representative of broader in-context learning behavior."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Randomly replacing labels in in-context demonstrations barely hurts performance, showing that label space, input distribution, and sequence format drive in-context learning more than ground-truth labels."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Randomly replacing labels in in-context demonstrations barely hurts performance on classification and multiple-choice tasks across many models."}],"snapshot_sha256":"61d4dfbf8396fe4eeb64d839bf014ddd2b62c460dba2dcabf580a436aeb1b12e"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"1a4bf141a32e3a1d3257f85429211d3c3a348e0ddeb32f97b209756120bf00e4"},"paper":{"abstract_excerpt":"Large language models (LMs) are able to in-context learn -- perform a new task via inference alone by conditioning on a few input-label pairs (demonstrations) and making predictions for new inputs. However, there has been little understanding of how the model learns and which aspects of the demonstrations contribute to end task performance. In this paper, we show that ground truth demonstrations are in fact not required -- randomly replacing labels in the demonstrations barely hurts performance on a range of classification and multi-choce tasks, consistently over 12 different models including ","authors_text":"Ari Holtzman, Hannaneh Hajishirzi, Luke Zettlemoyer, Mikel Artetxe, Mike Lewis, Sewon Min, Xinxi Lyu","cross_cats":["cs.AI"],"headline":"Randomly replacing labels in in-context demonstrations barely hurts performance on classification and multiple-choice tasks across many models.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2022-02-25T17:25:19Z","title":"Rethinking the Role of Demonstrations: What Makes In-Context Learning Work?"},"references":{"count":237,"internal_anchors":6,"resolved_work":237,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Robust Disambiguation of Named Entities in Text","work_id":"3474f56b-b2f4-4a79-969b-ce696ce45a7a","year":2011},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"CODAH : An Adversarially-Authored Question Answering Dataset for Common Sense","work_id":"03c831cd-ef72-48f3-9e66-66368b214d6c","year":2019},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Dolan, William B. and Brockett, Chris. Automatically Constructing a Corpus of Sentential Paraphrases. Proceedings of the Third International Workshop on Paraphrasing ( IWP 2005). 2005","work_id":"30b6b875-c3df-419a-8022-45756fa18613","year":2005},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"DBpedia - A large-scale, multilingual knowledge base extracted from Wikipedia ,author=. Semantic Web ,year=","work_id":"742ea21d-e273-4d5b-af41-a056bd9f6844","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Abductive Commonsense Reasoning ,author=. ICLR ,year=","work_id":"696a87a7-3cc0-4345-b7e8-0a4048be94f4","year":null}],"snapshot_sha256":"e0606b4aed025a4589039248ad69c666c1f7300df7f660b2b3fbf175f80bc8f5"},"source":{"id":"2202.12837","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-15T09:47:47.546394Z","id":"d0753e83-3c25-4144-8164-18aa9ee7932b","model_set":{"reader":"grok-4.3"},"one_line_summary":"Randomly replacing labels in in-context demonstrations barely hurts performance, showing that label space, input distribution, and sequence format drive in-context learning more than ground-truth labels.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Randomly replacing labels in in-context demonstrations barely hurts performance on classification and multiple-choice tasks across many models.","strongest_claim":"ground truth demonstrations are in fact not required -- randomly replacing labels in the demonstrations barely hurts performance on a range of classification and multi-choice tasks, consistently over 12 different models including GPT-3","weakest_assumption":"That randomly replacing labels does not introduce unintended statistical cues or that the chosen classification and multiple-choice tasks are representative of broader in-context learning behavior."}},"verdict_id":"d0753e83-3c25-4144-8164-18aa9ee7932b"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:7763be096576eb86b90619146dcdac2c828341bad4930d8a4416cad0df1c819b","target":"record","created_at":"2026-05-17T23:38:52Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"e4913c3c1646b32d0c2b76710aa070a3c523fa645f2f2b1702d4594d1213ece1","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2022-02-25T17:25:19Z","title_canon_sha256":"274847321a77ccf35bc2559050cd585919998fa6fb270700a1bc1d6087907a26"},"schema_version":"1.0","source":{"id":"2202.12837","kind":"arxiv","version":2}},"canonical_sha256":"5bb44a2e0d1933c8ef62b58c87fef06d4b52c2889253a1717819c66279b87b41","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"5bb44a2e0d1933c8ef62b58c87fef06d4b52c2889253a1717819c66279b87b41","first_computed_at":"2026-05-17T23:38:52.845449Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:52.845449Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"y8KSO5K+4JV91FEaGp5J8yR+oD75stLRtJJsmj0BOzoYdLd9osaposp195/wGuO22EgMALl/V8AX6fommT1KAA==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:52.846091Z","signed_message":"canonical_sha256_bytes"},"source_id":"2202.12837","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:7763be096576eb86b90619146dcdac2c828341bad4930d8a4416cad0df1c819b","sha256:373c0086c469ad2d12df4b15ab749c70f835203034dc9262dabd838a767c2611"],"state_sha256":"5699d8a45cf2f3946bf344dbc94c5319ba0a69d5b75707ca29d27e6825653285"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"S8x2WlxO7cAvelkUcfpEr8vkXopxEeKP0zR/KEjzF25s7xjWcTZPGytHK+m7mjHf7UAp8DE8eDhO94mRqYa4BQ==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-30T12:07:29.060842Z","bundle_sha256":"c62338d13eef66fe224530656a22f23efa80b223cadb130ca60e54f56948e360"}}