{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:I4FI2JPHCXWERRKM2WWGSQLV3F","short_pith_number":"pith:I4FI2JPH","schema_version":"1.0","canonical_sha256":"470a8d25e715ec48c54cd5ac694175d9798a17ba7c66e061b9efebaa4e298406","source":{"kind":"arxiv","id":"2605.12530","version":1},"attestation_state":"computed","paper":{"title":"In-Situ Behavioral Evaluation for LLM Fairness, Not Standardized-Test Scores","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Standardized-test scores for LLM fairness are dominated by prompt wording choices unrelated to fairness itself.","cross_cats":["cs.AI","cs.CY"],"primary_cat":"cs.CL","authors_text":"Brando Miranda, Deonna Owens, Sang T. Truong, Sanmi Koyejo, Shreyas Sharma, Yibo Jacky Zhang, Zeyu Tang","submitted_at":"2026-04-21T18:38:50Z","abstract_excerpt":"LLM fairness should be evaluated through in-situ conversational behavior rather than standardized-test Q&A benchmarks. We show that the standardized-test paradigm can be structurally unreliable: surface-level prompt construction choices, although entirely orthogonal to the fairness question being tested, account for the majority of score variance, shift fairness conclusions in both the direction and the magnitude, and result in severe discordance in model rankings. We develop MAC-Fairness, a multi-agent conversational framework that embeds controlled variation factors into multi-round dialogue"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":true,"formal_links_present":false},"canonical_record":{"source":{"id":"2605.12530","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-04-21T18:38:50Z","cross_cats_sorted":["cs.AI","cs.CY"],"title_canon_sha256":"ee053f754da8a53b2955526e7832ae7e9ce032b9073784015f36aecd87e4a021","abstract_canon_sha256":"7e006b07dc5ff1fda2bd23737db799bee303d478e7a420d9183a0b1e4c463eb9"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T03:10:02.692137Z","signature_b64":"fgCX12MdSe8pO6qzOT1opta7UufhtHPf1fTBKT5vosaC9DNvjhJWtnkEWTwh3uGxN45LXzllZnU+dCja8BkEBA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"470a8d25e715ec48c54cd5ac694175d9798a17ba7c66e061b9efebaa4e298406","last_reissued_at":"2026-05-18T03:10:02.691430Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T03:10:02.691430Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"In-Situ Behavioral Evaluation for LLM Fairness, Not Standardized-Test Scores","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Standardized-test scores for LLM fairness are dominated by prompt wording choices unrelated to fairness itself.","cross_cats":["cs.AI","cs.CY"],"primary_cat":"cs.CL","authors_text":"Brando Miranda, Deonna Owens, Sang T. Truong, Sanmi Koyejo, Shreyas Sharma, Yibo Jacky Zhang, Zeyu Tang","submitted_at":"2026-04-21T18:38:50Z","abstract_excerpt":"LLM fairness should be evaluated through in-situ conversational behavior rather than standardized-test Q&A benchmarks. We show that the standardized-test paradigm can be structurally unreliable: surface-level prompt construction choices, although entirely orthogonal to the fairness question being tested, account for the majority of score variance, shift fairness conclusions in both the direction and the magnitude, and result in severe discordance in model rankings. We develop MAC-Fairness, a multi-agent conversational framework that embeds controlled variation factors into multi-round dialogue"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"surface-level prompt construction choices, although entirely orthogonal to the fairness question being tested, account for the majority of score variance, shift fairness conclusions in both the direction and the magnitude, and result in severe discordance in model rankings.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That conversational behavior observed in the multi-agent MAC-Fairness setup is a valid, generalizable proxy for real-world fairness that is not itself distorted by the artificial dialogue structure or agent identities chosen.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Standardized-test benchmarks for LLM fairness are unreliable because prompt wording alone drives most score variance and ranking changes, while a multi-agent conversational framework reveals consistent model-specific fairness behaviors across millions of dialogues.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Standardized-test scores for LLM fairness are dominated by prompt wording choices unrelated to fairness itself.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"bc6772160c81ecb424b4a0347e87fbf051ef68c960a3141aa70d81990361c2f5"},"source":{"id":"2605.12530","kind":"arxiv","version":1},"verdict":{"id":"e54b7aee-18ff-45d6-b22c-9fd04ece0aa8","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-14T21:26:41.125462Z","strongest_claim":"surface-level prompt construction choices, although entirely orthogonal to the fairness question being tested, account for the majority of score variance, shift fairness conclusions in both the direction and the magnitude, and result in severe discordance in model rankings.","one_line_summary":"Standardized-test benchmarks for LLM fairness are unreliable because prompt wording alone drives most score variance and ranking changes, while a multi-agent conversational framework reveals consistent model-specific fairness behaviors across millions of dialogues.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That conversational behavior observed in the multi-agent MAC-Fairness setup is a valid, generalizable proxy for real-world fairness that is not itself distorted by the artificial dialogue structure or agent identities chosen.","pith_extraction_headline":"Standardized-test scores for LLM fairness are dominated by prompt wording choices unrelated to fairness itself."},"references":{"count":15,"sample":[{"doi":"","year":null,"title":"Phi-4 Technical Report","work_id":"b6274271-7af9-4ee8-993b-ba1ba4205ba8","ref_index":1,"cited_arxiv_id":"2412.08905","is_internal_anchor":true},{"doi":"","year":null,"title":"Phi-4-Mini Technical Report: Compact yet Powerful Multimodal Language Models via Mixture-of-LoRAs","work_id":"83956045-536a-41ff-af02-b80e2a614eab","ref_index":2,"cited_arxiv_id":"2503.01743","is_internal_anchor":true},{"doi":"","year":2025,"title":"Orpp: Self-optimizing role-playing prompts to enhance language model capabilities","work_id":"bba2aa67-af14-47bf-9d5f-5e7e9764e633","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"The Llama 3 Herd of Models","work_id":"1549a635-88af-4ac1-acfe-51ae7bb53345","ref_index":4,"cited_arxiv_id":"2407.21783","is_internal_anchor":true},{"doi":"","year":2020,"title":"Llm generated persona is a promise with a catch","work_id":"6f314463-0f42-4eea-a5c3-607e733d6afe","ref_index":5,"cited_arxiv_id":"2601.08584","is_internal_anchor":true}],"resolved_work":15,"snapshot_sha256":"51168f890b0bf5c18c00c15967b0a2400cf61cb8e3161ba8f21070cfe27529d7","internal_anchors":9},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2605.12530","created_at":"2026-05-18T03:10:02.691537+00:00"},{"alias_kind":"arxiv_version","alias_value":"2605.12530v1","created_at":"2026-05-18T03:10:02.691537+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.12530","created_at":"2026-05-18T03:10:02.691537+00:00"},{"alias_kind":"pith_short_12","alias_value":"I4FI2JPHCXWE","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_16","alias_value":"I4FI2JPHCXWERRKM","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_8","alias_value":"I4FI2JPH","created_at":"2026-05-18T12:33:37.589309+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/I4FI2JPHCXWERRKM2WWGSQLV3F","json":"https://pith.science/pith/I4FI2JPHCXWERRKM2WWGSQLV3F.json","graph_json":"https://pith.science/api/pith-number/I4FI2JPHCXWERRKM2WWGSQLV3F/graph.json","events_json":"https://pith.science/api/pith-number/I4FI2JPHCXWERRKM2WWGSQLV3F/events.json","paper":"https://pith.science/paper/I4FI2JPH"},"agent_actions":{"view_html":"https://pith.science/pith/I4FI2JPHCXWERRKM2WWGSQLV3F","download_json":"https://pith.science/pith/I4FI2JPHCXWERRKM2WWGSQLV3F.json","view_paper":"https://pith.science/paper/I4FI2JPH","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2605.12530&json=true","fetch_graph":"https://pith.science/api/pith-number/I4FI2JPHCXWERRKM2WWGSQLV3F/graph.json","fetch_events":"https://pith.science/api/pith-number/I4FI2JPHCXWERRKM2WWGSQLV3F/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/I4FI2JPHCXWERRKM2WWGSQLV3F/action/timestamp_anchor","attest_storage":"https://pith.science/pith/I4FI2JPHCXWERRKM2WWGSQLV3F/action/storage_attestation","attest_author":"https://pith.science/pith/I4FI2JPHCXWERRKM2WWGSQLV3F/action/author_attestation","sign_citation":"https://pith.science/pith/I4FI2JPHCXWERRKM2WWGSQLV3F/action/citation_signature","submit_replication":"https://pith.science/pith/I4FI2JPHCXWERRKM2WWGSQLV3F/action/replication_record"}},"created_at":"2026-05-18T03:10:02.691537+00:00","updated_at":"2026-05-18T03:10:02.691537+00:00"}