{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:XGE6GUEMKDUZEL3N6ETNMDGFXY","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"00464b526e06fe7a5deb6c685847acc2e7c66430950971527412607535ba97ec","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CL","submitted_at":"2026-06-10T14:12:19Z","title_canon_sha256":"97d717a8b90ca94a1fc83151bdeef9d885d405516821a651607cb98dfee4c89a"},"schema_version":"1.0","source":{"id":"2606.12117","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2606.12117","created_at":"2026-06-11T01:10:49Z"},{"alias_kind":"arxiv_version","alias_value":"2606.12117v1","created_at":"2026-06-11T01:10:49Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.12117","created_at":"2026-06-11T01:10:49Z"},{"alias_kind":"pith_short_12","alias_value":"XGE6GUEMKDUZ","created_at":"2026-06-11T01:10:49Z"},{"alias_kind":"pith_short_16","alias_value":"XGE6GUEMKDUZEL3N","created_at":"2026-06-11T01:10:49Z"},{"alias_kind":"pith_short_8","alias_value":"XGE6GUEM","created_at":"2026-06-11T01:10:49Z"}],"graph_snapshots":[{"event_id":"sha256:c72e61141d126122274102985ec4239b36d2311544d40c1f062e45a14d7af29c","target":"graph","created_at":"2026-06-11T01:10:49Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2606.12117/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Benchmark scores often misrepresent a large language model's (LLM's) knowledge, because they rely, e.g., on the model's ability to follow specific formatting requirements. This especially penalizes base models that may know the correct answers but lack the ability -- typically introduced in post-training -- to structure them as instructed. To overcome this, we propose soft-prompt tuning, an efficient, fair, and architecture-agnostic model evaluation. By optimizing only 10 soft-prompt vectors (roughly 0.0006% parameters for a 7B model) over a short tuning period, we adapt models to specific ben","authors_text":"Bastian Boll, Bj\\\"orn Deiseroth, Kristian Kersting, Letitia Parcalabescu, Selen Erkan","cross_cats":["cs.AI"],"headline":"","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CL","submitted_at":"2026-06-10T14:12:19Z","title":"Soft-Prompt Tuning for Fair and Efficient LLM Benchmark Evaluation"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.12117","kind":"arxiv","version":1},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:568c68287263e3d8ea835925be26c9a526c95677895cb5add58bc49f5620da74","target":"record","created_at":"2026-06-11T01:10:49Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"00464b526e06fe7a5deb6c685847acc2e7c66430950971527412607535ba97ec","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CL","submitted_at":"2026-06-10T14:12:19Z","title_canon_sha256":"97d717a8b90ca94a1fc83151bdeef9d885d405516821a651607cb98dfee4c89a"},"schema_version":"1.0","source":{"id":"2606.12117","kind":"arxiv","version":1}},"canonical_sha256":"b989e3508c50e9922f6df126d60cc5be3d9454948f81088b34cacffecb5f0a7f","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"b989e3508c50e9922f6df126d60cc5be3d9454948f81088b34cacffecb5f0a7f","first_computed_at":"2026-06-11T01:10:49.313649Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-06-11T01:10:49.313649Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"1lj85gHQ29I7XYIp1ZvUmdVMN0yp70zWPCK0MlQarFemsb1uGpcXOyRsSi69l4pIo4fC+bAI7SUfijAuB+rhCw==","signature_status":"signed_v1","signed_at":"2026-06-11T01:10:49.314542Z","signed_message":"canonical_sha256_bytes"},"source_id":"2606.12117","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:568c68287263e3d8ea835925be26c9a526c95677895cb5add58bc49f5620da74","sha256:c72e61141d126122274102985ec4239b36d2311544d40c1f062e45a14d7af29c"],"state_sha256":"3da1c24d6213d09eaf0cfa4104f6a0a5fcf7de27bbaca9f0bdc0fbeec7eabfa8"}