{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:KIN7B4ATSQNOJWEPT4HK35ICM6","short_pith_number":"pith:KIN7B4AT","schema_version":"1.0","canonical_sha256":"521bf0f013941ae4d88f9f0eadf502678bf11b90cea406cf1661b4d94c8df7e7","source":{"kind":"arxiv","id":"2603.10044","version":2},"attestation_state":"computed","paper":{"title":"Safety Under Scaffolding: How Evaluation Conditions Shape Measured Safety","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI","cs.CL","cs.LG"],"primary_cat":"cs.SE","authors_text":"David Gringras","submitted_at":"2026-03-08T01:37:45Z","abstract_excerpt":"A safety score earned on a benchmark need not predict how the same model behaves once it is wrapped in an agentic scaffold the benchmark never tested. We ran six frontier models through four deployment configurations (direct API, ReAct, multi-agent critic, map-reduce delegation): N = 62,808 blinded, pre-registered, equivalence-tested evaluations across four safety benchmarks (BBQ, TruthfulQA, XSTest/OR-Bench, sycophancy), plus three supporting analyses.\n  ReAct and multi-agent scaffolds stay within a pre-registered +/-2 pp equivalence margin; map-reduce delegation degrades measured safety (NNH"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2603.10044","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.SE","submitted_at":"2026-03-08T01:37:45Z","cross_cats_sorted":["cs.AI","cs.CL","cs.LG"],"title_canon_sha256":"8325c3270969d5258186ae5386313663b39cdb5ad8f39814f91e8249b4b91842","abstract_canon_sha256":"ae4b764ea964f81bdbb2639735f270c6491cb7c98cbab629b0627fb335ba5a7c"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-04T02:07:45.491671Z","signature_b64":"GD4rUUUy7+jKu4hw7Xo7+t93UobnD6GQpdbNpyNh6MACRHZAvNFo/5JhyCzwTHE9H3S4Dw0VfATYGAZZmqOmAA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"521bf0f013941ae4d88f9f0eadf502678bf11b90cea406cf1661b4d94c8df7e7","last_reissued_at":"2026-06-04T02:07:45.490896Z","signature_status":"signed_v1","first_computed_at":"2026-06-04T02:07:45.490896Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Safety Under Scaffolding: How Evaluation Conditions Shape Measured Safety","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI","cs.CL","cs.LG"],"primary_cat":"cs.SE","authors_text":"David Gringras","submitted_at":"2026-03-08T01:37:45Z","abstract_excerpt":"A safety score earned on a benchmark need not predict how the same model behaves once it is wrapped in an agentic scaffold the benchmark never tested. We ran six frontier models through four deployment configurations (direct API, ReAct, multi-agent critic, map-reduce delegation): N = 62,808 blinded, pre-registered, equivalence-tested evaluations across four safety benchmarks (BBQ, TruthfulQA, XSTest/OR-Bench, sycophancy), plus three supporting analyses.\n  ReAct and multi-agent scaffolds stay within a pre-registered +/-2 pp equivalence margin; map-reduce delegation degrades measured safety (NNH"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2603.10044","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2603.10044/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2603.10044","created_at":"2026-06-04T02:07:45.491006+00:00"},{"alias_kind":"arxiv_version","alias_value":"2603.10044v2","created_at":"2026-06-04T02:07:45.491006+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2603.10044","created_at":"2026-06-04T02:07:45.491006+00:00"},{"alias_kind":"pith_short_12","alias_value":"KIN7B4ATSQNO","created_at":"2026-06-04T02:07:45.491006+00:00"},{"alias_kind":"pith_short_16","alias_value":"KIN7B4ATSQNOJWEP","created_at":"2026-06-04T02:07:45.491006+00:00"},{"alias_kind":"pith_short_8","alias_value":"KIN7B4AT","created_at":"2026-06-04T02:07:45.491006+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":1,"internal_anchor_count":1,"sample":[{"citing_arxiv_id":"2605.10779","citing_title":"LITMUS: Benchmarking Behavioral Jailbreaks of LLM Agents in Real OS Environments","ref_index":17,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/KIN7B4ATSQNOJWEPT4HK35ICM6","json":"https://pith.science/pith/KIN7B4ATSQNOJWEPT4HK35ICM6.json","graph_json":"https://pith.science/api/pith-number/KIN7B4ATSQNOJWEPT4HK35ICM6/graph.json","events_json":"https://pith.science/api/pith-number/KIN7B4ATSQNOJWEPT4HK35ICM6/events.json","paper":"https://pith.science/paper/KIN7B4AT"},"agent_actions":{"view_html":"https://pith.science/pith/KIN7B4ATSQNOJWEPT4HK35ICM6","download_json":"https://pith.science/pith/KIN7B4ATSQNOJWEPT4HK35ICM6.json","view_paper":"https://pith.science/paper/KIN7B4AT","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2603.10044&json=true","fetch_graph":"https://pith.science/api/pith-number/KIN7B4ATSQNOJWEPT4HK35ICM6/graph.json","fetch_events":"https://pith.science/api/pith-number/KIN7B4ATSQNOJWEPT4HK35ICM6/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/KIN7B4ATSQNOJWEPT4HK35ICM6/action/timestamp_anchor","attest_storage":"https://pith.science/pith/KIN7B4ATSQNOJWEPT4HK35ICM6/action/storage_attestation","attest_author":"https://pith.science/pith/KIN7B4ATSQNOJWEPT4HK35ICM6/action/author_attestation","sign_citation":"https://pith.science/pith/KIN7B4ATSQNOJWEPT4HK35ICM6/action/citation_signature","submit_replication":"https://pith.science/pith/KIN7B4ATSQNOJWEPT4HK35ICM6/action/replication_record"}},"created_at":"2026-06-04T02:07:45.491006+00:00","updated_at":"2026-06-04T02:07:45.491006+00:00"}