{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:PHQYLR7FLXVGT2IYQILCUAHHKN","short_pith_number":"pith:PHQYLR7F","schema_version":"1.0","canonical_sha256":"79e185c7e55dea69e91882162a00e753716bae0ce0553295251d69524ab648e0","source":{"kind":"arxiv","id":"2603.23485","version":2},"attestation_state":"computed","paper":{"title":"Failure of contextual invariance in large language models","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","cs.CY"],"primary_cat":"cs.CL","authors_text":"Andrea Baronchelli, Ariel Flint, Luca Maria Aiello, Sagar Kumar","submitted_at":"2026-03-24T17:52:22Z","abstract_excerpt":"Standard evaluation practices assume that large language model (LLM) outputs are stable when prompts are embedded in contextually equivalent discourses. Here, we test this assumption in the setting of gender inference. Using a controlled pronoun selection task, we introduce minimal, theoretically uninformative discourse context and find that this induces large, systematic shifts in model outputs. Correlations with cultural gender stereotypes, present in decontextualized settings, weaken or disappear once context is introduced, while theoretically irrelevant features, such as the gender of a pr"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2603.23485","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2026-03-24T17:52:22Z","cross_cats_sorted":["cs.AI","cs.CY"],"title_canon_sha256":"d26fd3f457b8576d2632480dc1505ee574b9f7764e7653eb2b0e57eae33675eb","abstract_canon_sha256":"94b14159efe582aeb463a8564b4044cf8ef3c6566e6c697471afc79c57cc423b"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-02T02:04:52.590583Z","signature_b64":"7sXqY60WCG1g1NC8wvfjlIsLX7kEUtRNqcntxv/Dv1Pbl3yQHByPxvf00I+vQwX2p4zXy2rR6FyE8KvWM1f9Cw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"79e185c7e55dea69e91882162a00e753716bae0ce0553295251d69524ab648e0","last_reissued_at":"2026-06-02T02:04:52.590078Z","signature_status":"signed_v1","first_computed_at":"2026-06-02T02:04:52.590078Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Failure of contextual invariance in large language models","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","cs.CY"],"primary_cat":"cs.CL","authors_text":"Andrea Baronchelli, Ariel Flint, Luca Maria Aiello, Sagar Kumar","submitted_at":"2026-03-24T17:52:22Z","abstract_excerpt":"Standard evaluation practices assume that large language model (LLM) outputs are stable when prompts are embedded in contextually equivalent discourses. Here, we test this assumption in the setting of gender inference. Using a controlled pronoun selection task, we introduce minimal, theoretically uninformative discourse context and find that this induces large, systematic shifts in model outputs. Correlations with cultural gender stereotypes, present in decontextualized settings, weaken or disappear once context is introduced, while theoretically irrelevant features, such as the gender of a pr"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2603.23485","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2603.23485/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2603.23485","created_at":"2026-06-02T02:04:52.590138+00:00"},{"alias_kind":"arxiv_version","alias_value":"2603.23485v2","created_at":"2026-06-02T02:04:52.590138+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2603.23485","created_at":"2026-06-02T02:04:52.590138+00:00"},{"alias_kind":"pith_short_12","alias_value":"PHQYLR7FLXVG","created_at":"2026-06-02T02:04:52.590138+00:00"},{"alias_kind":"pith_short_16","alias_value":"PHQYLR7FLXVGT2IY","created_at":"2026-06-02T02:04:52.590138+00:00"},{"alias_kind":"pith_short_8","alias_value":"PHQYLR7F","created_at":"2026-06-02T02:04:52.590138+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":1,"internal_anchor_count":1,"sample":[{"citing_arxiv_id":"2604.16755","citing_title":"Machine individuality: Separating genuine idiosyncrasy from response bias in large language models","ref_index":15,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/PHQYLR7FLXVGT2IYQILCUAHHKN","json":"https://pith.science/pith/PHQYLR7FLXVGT2IYQILCUAHHKN.json","graph_json":"https://pith.science/api/pith-number/PHQYLR7FLXVGT2IYQILCUAHHKN/graph.json","events_json":"https://pith.science/api/pith-number/PHQYLR7FLXVGT2IYQILCUAHHKN/events.json","paper":"https://pith.science/paper/PHQYLR7F"},"agent_actions":{"view_html":"https://pith.science/pith/PHQYLR7FLXVGT2IYQILCUAHHKN","download_json":"https://pith.science/pith/PHQYLR7FLXVGT2IYQILCUAHHKN.json","view_paper":"https://pith.science/paper/PHQYLR7F","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2603.23485&json=true","fetch_graph":"https://pith.science/api/pith-number/PHQYLR7FLXVGT2IYQILCUAHHKN/graph.json","fetch_events":"https://pith.science/api/pith-number/PHQYLR7FLXVGT2IYQILCUAHHKN/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/PHQYLR7FLXVGT2IYQILCUAHHKN/action/timestamp_anchor","attest_storage":"https://pith.science/pith/PHQYLR7FLXVGT2IYQILCUAHHKN/action/storage_attestation","attest_author":"https://pith.science/pith/PHQYLR7FLXVGT2IYQILCUAHHKN/action/author_attestation","sign_citation":"https://pith.science/pith/PHQYLR7FLXVGT2IYQILCUAHHKN/action/citation_signature","submit_replication":"https://pith.science/pith/PHQYLR7FLXVGT2IYQILCUAHHKN/action/replication_record"}},"created_at":"2026-06-02T02:04:52.590138+00:00","updated_at":"2026-06-02T02:04:52.590138+00:00"}