{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:LBJDTFRIKMY7ZLKJXY6H5FJJ5Q","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"a51eeb3e733a8badcd6c84c0b9c92164847460318a7ae9ea72aa13d87c7da7c4","cross_cats_sorted":["cs.CL","cs.LG"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2026-05-31T21:46:52Z","title_canon_sha256":"45bd874ac93029cb818d327fbfdb7d98da5f2df4d7374fa60348895a722215df"},"schema_version":"1.0","source":{"id":"2606.01462","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2606.01462","created_at":"2026-06-02T02:04:33Z"},{"alias_kind":"arxiv_version","alias_value":"2606.01462v1","created_at":"2026-06-02T02:04:33Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.01462","created_at":"2026-06-02T02:04:33Z"},{"alias_kind":"pith_short_12","alias_value":"LBJDTFRIKMY7","created_at":"2026-06-02T02:04:33Z"},{"alias_kind":"pith_short_16","alias_value":"LBJDTFRIKMY7ZLKJ","created_at":"2026-06-02T02:04:33Z"},{"alias_kind":"pith_short_8","alias_value":"LBJDTFRI","created_at":"2026-06-02T02:04:33Z"}],"graph_snapshots":[{"event_id":"sha256:659850f23ef163a6269ae24a14e6eea20caee5866781002bf33971468b3ba7a8","target":"graph","created_at":"2026-06-02T02:04:33Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2606.01462/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Studies of human reasoning have shown that people are typically stronger at evaluating reasoning than producing it from scratch. In contrast, large reasoning models (LRMs) are trained to excel at producing long chains of reasoning to solve complex problems. How then do LRMs perform at evaluating reasons? We investigate this with the Valid-Answer-Invalid-Reasoning (VAIR) dataset: math problems and solutions with trivial reasoning flaws but valid answers, designed to isolate reasoning evaluation from the confound of reasoning production. Unlike humans, who we find are only 6% worse at grading th","authors_text":"Armando Solar-Lezama, Mingzhong Sun, Tan Zhi-Xuan, Teresa Yeo","cross_cats":["cs.CL","cs.LG"],"headline":"","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2026-05-31T21:46:52Z","title":"An Enigma of Artificial Reason: Investigating the Production-Evaluation Gap in Large Reasoning Models"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.01462","kind":"arxiv","version":1},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:da1eceab1780d55d3039b3d8e16a3d4a62cd2df2ade44f8122a5758fb06977d3","target":"record","created_at":"2026-06-02T02:04:33Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"a51eeb3e733a8badcd6c84c0b9c92164847460318a7ae9ea72aa13d87c7da7c4","cross_cats_sorted":["cs.CL","cs.LG"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2026-05-31T21:46:52Z","title_canon_sha256":"45bd874ac93029cb818d327fbfdb7d98da5f2df4d7374fa60348895a722215df"},"schema_version":"1.0","source":{"id":"2606.01462","kind":"arxiv","version":1}},"canonical_sha256":"58523996285331fcad49be3c7e9529ec0ece565b8f9f806ceb2a92c74e823066","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"58523996285331fcad49be3c7e9529ec0ece565b8f9f806ceb2a92c74e823066","first_computed_at":"2026-06-02T02:04:33.661117Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-06-02T02:04:33.661117Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"GpvDRnByyiAp1gTmNAc3uWcYMs71rbZ0angxUE5DBYq8lLR1CNe3TwKBts18Iv42NUBgMqwKq1K7J+4YPNqqCQ==","signature_status":"signed_v1","signed_at":"2026-06-02T02:04:33.661521Z","signed_message":"canonical_sha256_bytes"},"source_id":"2606.01462","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:da1eceab1780d55d3039b3d8e16a3d4a62cd2df2ade44f8122a5758fb06977d3","sha256:659850f23ef163a6269ae24a14e6eea20caee5866781002bf33971468b3ba7a8"],"state_sha256":"b2c992025bcc1e50d0bfa01ef79916f85a56e29eca653b2776e29d0ee0183640"}