{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:IBAG4VXVKTFWVHETRH244WNFSB","short_pith_number":"pith:IBAG4VXV","canonical_record":{"source":{"id":"2606.07379","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-06-05T15:20:37Z","cross_cats_sorted":["cs.AI","cs.CL","stat.ME"],"title_canon_sha256":"53d3dedf50796ecd63677e18f766d699159376f4415bdf0c61999f7be802cdeb","abstract_canon_sha256":"335198ce7c321bbdbb17740f2a7be9963cd6bb376b963e787ad75b054b0c3553"},"schema_version":"1.0"},"canonical_sha256":"40406e56f554cb6a9c9389f5ce59a5907baeddd37e6de67499cc9450c9e01f57","source":{"kind":"arxiv","id":"2606.07379","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2606.07379","created_at":"2026-06-08T01:05:23Z"},{"alias_kind":"arxiv_version","alias_value":"2606.07379v1","created_at":"2026-06-08T01:05:23Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.07379","created_at":"2026-06-08T01:05:23Z"},{"alias_kind":"pith_short_12","alias_value":"IBAG4VXVKTFW","created_at":"2026-06-08T01:05:23Z"},{"alias_kind":"pith_short_16","alias_value":"IBAG4VXVKTFWVHET","created_at":"2026-06-08T01:05:23Z"},{"alias_kind":"pith_short_8","alias_value":"IBAG4VXV","created_at":"2026-06-08T01:05:23Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:IBAG4VXVKTFWVHETRH244WNFSB","target":"record","payload":{"canonical_record":{"source":{"id":"2606.07379","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-06-05T15:20:37Z","cross_cats_sorted":["cs.AI","cs.CL","stat.ME"],"title_canon_sha256":"53d3dedf50796ecd63677e18f766d699159376f4415bdf0c61999f7be802cdeb","abstract_canon_sha256":"335198ce7c321bbdbb17740f2a7be9963cd6bb376b963e787ad75b054b0c3553"},"schema_version":"1.0"},"canonical_sha256":"40406e56f554cb6a9c9389f5ce59a5907baeddd37e6de67499cc9450c9e01f57","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-08T01:05:23.549557Z","signature_b64":"6CKNyo0065tkpI31oykQpQAAuYPdbcnNk+RFrEKL7Kno43QRLoLAq3zXSjd0nf1rjnyzwMtI1heoVt5HDAIOBw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"40406e56f554cb6a9c9389f5ce59a5907baeddd37e6de67499cc9450c9e01f57","last_reissued_at":"2026-06-08T01:05:23.548975Z","signature_status":"signed_v1","first_computed_at":"2026-06-08T01:05:23.548975Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2606.07379","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-06-08T01:05:23Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"//rKps4FXM1a5CNiR5aiwE5rZL+juuBvLknWTIORdItqgZGy/lTZGc2OtDUcSWNN/6nb1jHPvcWEUzNR2xqFCw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-08T22:09:08.832828Z"},"content_sha256":"e9e35c18f5f8e45eef748ec17eb5a7b73081c7545367a15f26d5f8ab60019b75","schema_version":"1.0","event_id":"sha256:e9e35c18f5f8e45eef748ec17eb5a7b73081c7545367a15f26d5f8ab60019b75"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:IBAG4VXVKTFWVHETRH244WNFSB","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Do Coding Agents Deceive Us? Detecting and Preventing Cheating via Capped Evaluation with Randomized Tests","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI","cs.CL","stat.ME"],"primary_cat":"cs.LG","authors_text":"Johannes Ackermann, Masashi Sugiyama, Nontawat Charoenphakdee, Soichiro Nishimori, Takashi Ishida, Thanawat Lodkaew","submitted_at":"2026-06-05T15:20:37Z","abstract_excerpt":"A growing failure mode in agent evaluation and training is that models can achieve high evaluation scores by exploiting shortcuts instead of solving the intended task, producing deceptive performance. This makes evaluation scores unreliable as measures of true task-solving ability. We propose CapCode, a framework for constructing coding datasets with randomized tests whose best achievable non-cheating performance is deliberately capped below one. This capped-performance design gives evaluation scores a clearer interpretation: scores substantially above the cap are implausible and therefore pro"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.07379","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.07379/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-06-08T01:05:23Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"2q6YERKFJYCrGknggt2hQohwaY5pzVkN920gGYstFgT+QdmMyJryezI2Mx4N4S5n4Q3vzg04xZ+ssFR5P9LNBQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-08T22:09:08.833192Z"},"content_sha256":"5de08da4c40747e575d459d1aad9e5a99696e1a8054cb82c14853de5c487b5f1","schema_version":"1.0","event_id":"sha256:5de08da4c40747e575d459d1aad9e5a99696e1a8054cb82c14853de5c487b5f1"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/IBAG4VXVKTFWVHETRH244WNFSB/bundle.json","state_url":"https://pith.science/pith/IBAG4VXVKTFWVHETRH244WNFSB/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/IBAG4VXVKTFWVHETRH244WNFSB/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-08T22:09:08Z","links":{"resolver":"https://pith.science/pith/IBAG4VXVKTFWVHETRH244WNFSB","bundle":"https://pith.science/pith/IBAG4VXVKTFWVHETRH244WNFSB/bundle.json","state":"https://pith.science/pith/IBAG4VXVKTFWVHETRH244WNFSB/state.json","well_known_bundle":"https://pith.science/.well-known/pith/IBAG4VXVKTFWVHETRH244WNFSB/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:IBAG4VXVKTFWVHETRH244WNFSB","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"335198ce7c321bbdbb17740f2a7be9963cd6bb376b963e787ad75b054b0c3553","cross_cats_sorted":["cs.AI","cs.CL","stat.ME"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-06-05T15:20:37Z","title_canon_sha256":"53d3dedf50796ecd63677e18f766d699159376f4415bdf0c61999f7be802cdeb"},"schema_version":"1.0","source":{"id":"2606.07379","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2606.07379","created_at":"2026-06-08T01:05:23Z"},{"alias_kind":"arxiv_version","alias_value":"2606.07379v1","created_at":"2026-06-08T01:05:23Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.07379","created_at":"2026-06-08T01:05:23Z"},{"alias_kind":"pith_short_12","alias_value":"IBAG4VXVKTFW","created_at":"2026-06-08T01:05:23Z"},{"alias_kind":"pith_short_16","alias_value":"IBAG4VXVKTFWVHET","created_at":"2026-06-08T01:05:23Z"},{"alias_kind":"pith_short_8","alias_value":"IBAG4VXV","created_at":"2026-06-08T01:05:23Z"}],"graph_snapshots":[{"event_id":"sha256:5de08da4c40747e575d459d1aad9e5a99696e1a8054cb82c14853de5c487b5f1","target":"graph","created_at":"2026-06-08T01:05:23Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2606.07379/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"A growing failure mode in agent evaluation and training is that models can achieve high evaluation scores by exploiting shortcuts instead of solving the intended task, producing deceptive performance. This makes evaluation scores unreliable as measures of true task-solving ability. We propose CapCode, a framework for constructing coding datasets with randomized tests whose best achievable non-cheating performance is deliberately capped below one. This capped-performance design gives evaluation scores a clearer interpretation: scores substantially above the cap are implausible and therefore pro","authors_text":"Johannes Ackermann, Masashi Sugiyama, Nontawat Charoenphakdee, Soichiro Nishimori, Takashi Ishida, Thanawat Lodkaew","cross_cats":["cs.AI","cs.CL","stat.ME"],"headline":"","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-06-05T15:20:37Z","title":"Do Coding Agents Deceive Us? Detecting and Preventing Cheating via Capped Evaluation with Randomized Tests"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.07379","kind":"arxiv","version":1},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:e9e35c18f5f8e45eef748ec17eb5a7b73081c7545367a15f26d5f8ab60019b75","target":"record","created_at":"2026-06-08T01:05:23Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"335198ce7c321bbdbb17740f2a7be9963cd6bb376b963e787ad75b054b0c3553","cross_cats_sorted":["cs.AI","cs.CL","stat.ME"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-06-05T15:20:37Z","title_canon_sha256":"53d3dedf50796ecd63677e18f766d699159376f4415bdf0c61999f7be802cdeb"},"schema_version":"1.0","source":{"id":"2606.07379","kind":"arxiv","version":1}},"canonical_sha256":"40406e56f554cb6a9c9389f5ce59a5907baeddd37e6de67499cc9450c9e01f57","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"40406e56f554cb6a9c9389f5ce59a5907baeddd37e6de67499cc9450c9e01f57","first_computed_at":"2026-06-08T01:05:23.548975Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-06-08T01:05:23.548975Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"6CKNyo0065tkpI31oykQpQAAuYPdbcnNk+RFrEKL7Kno43QRLoLAq3zXSjd0nf1rjnyzwMtI1heoVt5HDAIOBw==","signature_status":"signed_v1","signed_at":"2026-06-08T01:05:23.549557Z","signed_message":"canonical_sha256_bytes"},"source_id":"2606.07379","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:e9e35c18f5f8e45eef748ec17eb5a7b73081c7545367a15f26d5f8ab60019b75","sha256:5de08da4c40747e575d459d1aad9e5a99696e1a8054cb82c14853de5c487b5f1"],"state_sha256":"03e485496cf3eb4e081b57777d1d1e04c83356113e10903c4739ad29a8fc4aed"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"bduoBYfZMQ8dEaqFvbQV5uq3jmlSqfv1Ill1XQnhCipJs4FJRt6dSnzqoYCnxo2SqG3qNxNsEGaZVe30w5nuDA==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-08T22:09:08.835204Z","bundle_sha256":"4de9172068fcb69e9a37888893a8b17120635f2bbc5d39496314ecabacc5051e"}}