{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2025:F7ZF6IGC6JAZERSPTT2SWOTWHV","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"3849567f6c1031e6853bd81d3daa2a59ba7f9e6980031ea6d4b1756c3d05482e","cross_cats_sorted":["cs.AI","stat.AP","stat.ML"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2025-12-02T05:59:18Z","title_canon_sha256":"800d60c597251a084f57b365597086af314f58ee569f06d2c8004c76a0efbe66"},"schema_version":"1.0","source":{"id":"2512.03109","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2512.03109","created_at":"2026-05-29T02:05:38Z"},{"alias_kind":"arxiv_version","alias_value":"2512.03109v2","created_at":"2026-05-29T02:05:38Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2512.03109","created_at":"2026-05-29T02:05:38Z"},{"alias_kind":"pith_short_12","alias_value":"F7ZF6IGC6JAZ","created_at":"2026-05-29T02:05:38Z"},{"alias_kind":"pith_short_16","alias_value":"F7ZF6IGC6JAZERSP","created_at":"2026-05-29T02:05:38Z"},{"alias_kind":"pith_short_8","alias_value":"F7ZF6IGC","created_at":"2026-05-29T02:05:38Z"}],"graph_snapshots":[{"event_id":"sha256:66ae12b6e3c42a6d7dd78a70b6ef52aeeaf514ed82cae93062c9baa9b0940ea0","target":"graph","created_at":"2026-05-29T02:05:38Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2512.03109/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Agentic AI systems execute a sequence of actions, such as reasoning steps or tool calls, in response to a user prompt. To evaluate the success of their trajectories, researchers have developed verifiers, such as LLM judges and process-reward models, to score the quality of each action in an agent's trajectory. Although these heuristic scores can be informative, there are no guarantees of correctness when used to decide whether an agent will yield a successful output. Here, we introduce e-valuator, a method to convert any black-box verifier score into a decision rule with provable control of fa","authors_text":"Aviv Regev, Bonnie Berger, Clara Fannjiang, Drew Prinster, Gabriele Scalia, Hanchen Wang, Shuvom Sadhuka","cross_cats":["cs.AI","stat.AP","stat.ML"],"headline":"","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2025-12-02T05:59:18Z","title":"E-valuator: Reliable Agent Verifiers with Sequential Hypothesis Testing"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2512.03109","kind":"arxiv","version":2},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:1bc168869f85acdd00a45d8837fd95e5b5a467c859f21ffff2b30e900b1ef152","target":"record","created_at":"2026-05-29T02:05:38Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"3849567f6c1031e6853bd81d3daa2a59ba7f9e6980031ea6d4b1756c3d05482e","cross_cats_sorted":["cs.AI","stat.AP","stat.ML"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2025-12-02T05:59:18Z","title_canon_sha256":"800d60c597251a084f57b365597086af314f58ee569f06d2c8004c76a0efbe66"},"schema_version":"1.0","source":{"id":"2512.03109","kind":"arxiv","version":2}},"canonical_sha256":"2ff25f20c2f24192464f9cf52b3a763d4597f4648a4e1bad1e6720e1f11a772d","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"2ff25f20c2f24192464f9cf52b3a763d4597f4648a4e1bad1e6720e1f11a772d","first_computed_at":"2026-05-29T02:05:38.137386Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-29T02:05:38.137386Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"YMic5F8+XhVGmVnQXjrBlQLIgaC2gqIoLCU+JPBexakb8OsAjwrIUWhPATP0YM3915WWqtipg0CLUtKjlx7XDA==","signature_status":"signed_v1","signed_at":"2026-05-29T02:05:38.138062Z","signed_message":"canonical_sha256_bytes"},"source_id":"2512.03109","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:1bc168869f85acdd00a45d8837fd95e5b5a467c859f21ffff2b30e900b1ef152","sha256:66ae12b6e3c42a6d7dd78a70b6ef52aeeaf514ed82cae93062c9baa9b0940ea0"],"state_sha256":"c6cc44df66be9cff45be960e515197211359762ec36fc0149e3c4ae2181ddc04"}