{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:QWIGJSQNTOJMAXZQWRDBTPSAPG","short_pith_number":"pith:QWIGJSQN","canonical_record":{"source":{"id":"2605.10379","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-11T11:23:36Z","cross_cats_sorted":[],"title_canon_sha256":"1b7f3f35c517047fae4eef1304aeec951765e7bb228216aeb258cf1aa64cb3bb","abstract_canon_sha256":"6d79407fe6c253eec7ab40bdb3729eefd4266fe0105ddaf41d61a35a0916aad9"},"schema_version":"1.0"},"canonical_sha256":"859064ca0d9b92c05f30b44619be4079ba784772670d01fe8648dabe83b92b4a","source":{"kind":"arxiv","id":"2605.10379","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.10379","created_at":"2026-06-26T01:15:53Z"},{"alias_kind":"arxiv_version","alias_value":"2605.10379v2","created_at":"2026-06-26T01:15:53Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.10379","created_at":"2026-06-26T01:15:53Z"},{"alias_kind":"pith_short_12","alias_value":"QWIGJSQNTOJM","created_at":"2026-06-26T01:15:53Z"},{"alias_kind":"pith_short_16","alias_value":"QWIGJSQNTOJMAXZQ","created_at":"2026-06-26T01:15:53Z"},{"alias_kind":"pith_short_8","alias_value":"QWIGJSQN","created_at":"2026-06-26T01:15:53Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:QWIGJSQNTOJMAXZQWRDBTPSAPG","target":"record","payload":{"canonical_record":{"source":{"id":"2605.10379","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-11T11:23:36Z","cross_cats_sorted":[],"title_canon_sha256":"1b7f3f35c517047fae4eef1304aeec951765e7bb228216aeb258cf1aa64cb3bb","abstract_canon_sha256":"6d79407fe6c253eec7ab40bdb3729eefd4266fe0105ddaf41d61a35a0916aad9"},"schema_version":"1.0"},"canonical_sha256":"859064ca0d9b92c05f30b44619be4079ba784772670d01fe8648dabe83b92b4a","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-26T01:15:53.112162Z","signature_b64":"a0keJNPp66I3N3sN+TaDNeW3bEkuVYPhsmxHmxpM+fHzAAsjk6qULdXI8VfK/LDSvRQ3QRjiHItKMoJ7PPu1Dg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"859064ca0d9b92c05f30b44619be4079ba784772670d01fe8648dabe83b92b4a","last_reissued_at":"2026-06-26T01:15:53.111753Z","signature_status":"signed_v1","first_computed_at":"2026-06-26T01:15:53.111753Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2605.10379","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-06-26T01:15:53Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"8eBGoyInadA4Hi8yIv8McX2IQBQ+OIazBE/1Eq54wM+JiAzkt7BFhFMQMMvezws5ubUt01KymyeGM3sbwNw8Bg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-30T23:25:11.129066Z"},"content_sha256":"b109f0a5acee98bed8d72b8516e78a2237801c809c98b4698934cad3223938b0","schema_version":"1.0","event_id":"sha256:b109f0a5acee98bed8d72b8516e78a2237801c809c98b4698934cad3223938b0"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:QWIGJSQNTOJMAXZQWRDBTPSAPG","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Not All Proofs Are Equal: Evaluating LLM Proof Quality Beyond Correctness","license":"http://creativecommons.org/licenses/by/4.0/","headline":"LLM-generated proofs differ substantially in quality beyond mere correctness, with measurable trade-offs.","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Dimitar I. Dimitrov, Ivo Petrov, Jasper Dekoninck, Martin Vechev","submitted_at":"2026-05-11T11:23:36Z","abstract_excerpt":"Large language models (LLMs) have become capable mathematical problem-solvers, often producing correct proofs for challenging problems. However, correctness alone is not sufficient: mathematical proofs should also be clear, concise, insightful, and transferable to other problems. While this proof quality is subjective and depends on the reader and context, many of its components are concrete and broadly valued. In this work, we identify such components and introduce ProofRank, a benchmark curated from challenging mathematical competitions. ProofRank evaluates several scalable proxies of proof "},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Across models, we find substantial differences in proof quality that are not captured by correctness-only benchmarks. We also observe significant trade-offs between proof-quality metrics and correctness, suggesting that future evaluations of mathematical reasoning should measure how useful LLM-generated proofs are.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the five chosen proxies (conciseness, computational ease, cognitive simplicity, diversity, adaptivity) are scalable, objective enough to measure automatically, and genuinely reflect the aspects of proof quality that matter to human readers and downstream use.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"LLM proofs for hard math problems show large differences in quality metrics like conciseness and cognitive simplicity that correctness-only tests miss, along with trade-offs between quality and correctness.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"LLM-generated proofs differ substantially in quality beyond mere correctness, with measurable trade-offs.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"bb2d637fd165b0f460a284f84d20a556ba426d2cdef0cfebebd2a8110f740454"},"source":{"id":"2605.10379","kind":"arxiv","version":2},"verdict":{"id":"c3e759a0-c599-4b55-bb4e-59a1d2448fb3","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-12T04:43:51.181080Z","strongest_claim":"Across models, we find substantial differences in proof quality that are not captured by correctness-only benchmarks. We also observe significant trade-offs between proof-quality metrics and correctness, suggesting that future evaluations of mathematical reasoning should measure how useful LLM-generated proofs are.","one_line_summary":"LLM proofs for hard math problems show large differences in quality metrics like conciseness and cognitive simplicity that correctness-only tests miss, along with trade-offs between quality and correctness.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the five chosen proxies (conciseness, computational ease, cognitive simplicity, diversity, adaptivity) are scalable, objective enough to measure automatically, and genuinely reflect the aspects of proof quality that matter to human readers and downstream use.","pith_extraction_headline":"LLM-generated proofs differ substantially in quality beyond mere correctness, with measurable trade-offs."},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.10379/integrity.json","findings":[],"available":true,"detectors_run":[{"name":"claim_evidence","ran_at":"2026-05-20T06:02:01.075682Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"ai_meta_artifact","ran_at":"2026-05-19T15:34:39.406634Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"doi_title_agreement","ran_at":"2026-05-19T11:31:18.383173Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"doi_compliance","ran_at":"2026-05-19T09:24:24.401000Z","status":"completed","version":"1.0.0","findings_count":0}],"snapshot_sha256":"e4268cf0d11a2d95b1b3eddaddac672f610723bd4da9821c000b85f319b203f5"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":2,"snapshot_sha256":"f373ba525dbe75bdf27f3f50e18c42f59228858334d0dd8d717aa9a92cd29f02"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"c3e759a0-c599-4b55-bb4e-59a1d2448fb3"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-06-26T01:15:53Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"65uLA1GgWxbNnTBmHL2k8I+SQa/Az1hZsqtO4OJ3FmLx46l/7VDpuYM+4/h/sRqACxgbd2vnxHooQu3hzx0oBw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-30T23:25:11.129588Z"},"content_sha256":"ea204ac7550c89bf39049b52686f706649304e0a0350ced7acdfbfd2b3cdacc5","schema_version":"1.0","event_id":"sha256:ea204ac7550c89bf39049b52686f706649304e0a0350ced7acdfbfd2b3cdacc5"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/QWIGJSQNTOJMAXZQWRDBTPSAPG/bundle.json","state_url":"https://pith.science/pith/QWIGJSQNTOJMAXZQWRDBTPSAPG/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/QWIGJSQNTOJMAXZQWRDBTPSAPG/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-30T23:25:11Z","links":{"resolver":"https://pith.science/pith/QWIGJSQNTOJMAXZQWRDBTPSAPG","bundle":"https://pith.science/pith/QWIGJSQNTOJMAXZQWRDBTPSAPG/bundle.json","state":"https://pith.science/pith/QWIGJSQNTOJMAXZQWRDBTPSAPG/state.json","well_known_bundle":"https://pith.science/.well-known/pith/QWIGJSQNTOJMAXZQWRDBTPSAPG/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:QWIGJSQNTOJMAXZQWRDBTPSAPG","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"6d79407fe6c253eec7ab40bdb3729eefd4266fe0105ddaf41d61a35a0916aad9","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-11T11:23:36Z","title_canon_sha256":"1b7f3f35c517047fae4eef1304aeec951765e7bb228216aeb258cf1aa64cb3bb"},"schema_version":"1.0","source":{"id":"2605.10379","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.10379","created_at":"2026-06-26T01:15:53Z"},{"alias_kind":"arxiv_version","alias_value":"2605.10379v2","created_at":"2026-06-26T01:15:53Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.10379","created_at":"2026-06-26T01:15:53Z"},{"alias_kind":"pith_short_12","alias_value":"QWIGJSQNTOJM","created_at":"2026-06-26T01:15:53Z"},{"alias_kind":"pith_short_16","alias_value":"QWIGJSQNTOJMAXZQ","created_at":"2026-06-26T01:15:53Z"},{"alias_kind":"pith_short_8","alias_value":"QWIGJSQN","created_at":"2026-06-26T01:15:53Z"}],"graph_snapshots":[{"event_id":"sha256:ea204ac7550c89bf39049b52686f706649304e0a0350ced7acdfbfd2b3cdacc5","target":"graph","created_at":"2026-06-26T01:15:53Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Across models, we find substantial differences in proof quality that are not captured by correctness-only benchmarks. We also observe significant trade-offs between proof-quality metrics and correctness, suggesting that future evaluations of mathematical reasoning should measure how useful LLM-generated proofs are."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the five chosen proxies (conciseness, computational ease, cognitive simplicity, diversity, adaptivity) are scalable, objective enough to measure automatically, and genuinely reflect the aspects of proof quality that matter to human readers and downstream use."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"LLM proofs for hard math problems show large differences in quality metrics like conciseness and cognitive simplicity that correctness-only tests miss, along with trade-offs between quality and correctness."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"LLM-generated proofs differ substantially in quality beyond mere correctness, with measurable trade-offs."}],"snapshot_sha256":"bb2d637fd165b0f460a284f84d20a556ba426d2cdef0cfebebd2a8110f740454"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"f373ba525dbe75bdf27f3f50e18c42f59228858334d0dd8d717aa9a92cd29f02"},"integrity":{"available":true,"clean":true,"detectors_run":[{"findings_count":0,"name":"claim_evidence","ran_at":"2026-05-20T06:02:01.075682Z","status":"completed","version":"1.0.0"},{"findings_count":0,"name":"ai_meta_artifact","ran_at":"2026-05-19T15:34:39.406634Z","status":"completed","version":"1.0.0"},{"findings_count":0,"name":"doi_title_agreement","ran_at":"2026-05-19T11:31:18.383173Z","status":"completed","version":"1.0.0"},{"findings_count":0,"name":"doi_compliance","ran_at":"2026-05-19T09:24:24.401000Z","status":"completed","version":"1.0.0"}],"endpoint":"/pith/2605.10379/integrity.json","findings":[],"snapshot_sha256":"e4268cf0d11a2d95b1b3eddaddac672f610723bd4da9821c000b85f319b203f5","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Large language models (LLMs) have become capable mathematical problem-solvers, often producing correct proofs for challenging problems. However, correctness alone is not sufficient: mathematical proofs should also be clear, concise, insightful, and transferable to other problems. While this proof quality is subjective and depends on the reader and context, many of its components are concrete and broadly valued. In this work, we identify such components and introduce ProofRank, a benchmark curated from challenging mathematical competitions. ProofRank evaluates several scalable proxies of proof ","authors_text":"Dimitar I. Dimitrov, Ivo Petrov, Jasper Dekoninck, Martin Vechev","cross_cats":[],"headline":"LLM-generated proofs differ substantially in quality beyond mere correctness, with measurable trade-offs.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-11T11:23:36Z","title":"Not All Proofs Are Equal: Evaluating LLM Proof Quality Beyond Correctness"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.10379","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-12T04:43:51.181080Z","id":"c3e759a0-c599-4b55-bb4e-59a1d2448fb3","model_set":{"reader":"grok-4.3"},"one_line_summary":"LLM proofs for hard math problems show large differences in quality metrics like conciseness and cognitive simplicity that correctness-only tests miss, along with trade-offs between quality and correctness.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"LLM-generated proofs differ substantially in quality beyond mere correctness, with measurable trade-offs.","strongest_claim":"Across models, we find substantial differences in proof quality that are not captured by correctness-only benchmarks. We also observe significant trade-offs between proof-quality metrics and correctness, suggesting that future evaluations of mathematical reasoning should measure how useful LLM-generated proofs are.","weakest_assumption":"That the five chosen proxies (conciseness, computational ease, cognitive simplicity, diversity, adaptivity) are scalable, objective enough to measure automatically, and genuinely reflect the aspects of proof quality that matter to human readers and downstream use."}},"verdict_id":"c3e759a0-c599-4b55-bb4e-59a1d2448fb3"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:b109f0a5acee98bed8d72b8516e78a2237801c809c98b4698934cad3223938b0","target":"record","created_at":"2026-06-26T01:15:53Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"6d79407fe6c253eec7ab40bdb3729eefd4266fe0105ddaf41d61a35a0916aad9","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-11T11:23:36Z","title_canon_sha256":"1b7f3f35c517047fae4eef1304aeec951765e7bb228216aeb258cf1aa64cb3bb"},"schema_version":"1.0","source":{"id":"2605.10379","kind":"arxiv","version":2}},"canonical_sha256":"859064ca0d9b92c05f30b44619be4079ba784772670d01fe8648dabe83b92b4a","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"859064ca0d9b92c05f30b44619be4079ba784772670d01fe8648dabe83b92b4a","first_computed_at":"2026-06-26T01:15:53.111753Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-06-26T01:15:53.111753Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"a0keJNPp66I3N3sN+TaDNeW3bEkuVYPhsmxHmxpM+fHzAAsjk6qULdXI8VfK/LDSvRQ3QRjiHItKMoJ7PPu1Dg==","signature_status":"signed_v1","signed_at":"2026-06-26T01:15:53.112162Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.10379","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:b109f0a5acee98bed8d72b8516e78a2237801c809c98b4698934cad3223938b0","sha256:ea204ac7550c89bf39049b52686f706649304e0a0350ced7acdfbfd2b3cdacc5"],"state_sha256":"2c01d3e577bdb7d1548a41e3e9d40b00aea99530d46807f27368eeca0f5d4bbf"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"iw9HqAB9xs/+6YQ1U1MQzqNSbUK8n5V5I9tlD9FENdO4SFs4iJw28iid61vpMXWGW8YEj+GSHyx8RPMK2MZACw==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-30T23:25:11.131920Z","bundle_sha256":"02aaf7310a4495f4404a4fdfa200ba578da90a8ac7f662a2f3f9a2491fb99036"}}