{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:KDNPIUSLEQP45MT6QC3PZAX7XH","short_pith_number":"pith:KDNPIUSL","schema_version":"1.0","canonical_sha256":"50daf4524b241fceb27e80b6fc82ffb9c9f50420cb020478e52e0e5ec3c6e7f2","source":{"kind":"arxiv","id":"2606.04751","version":1},"attestation_state":"computed","paper":{"title":"FALSIFYBENCH: Evaluating Inductive Reasoning in LLMs with Rule Discovery Games","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.AI","authors_text":"Katya Tentori, Leonardo Bertolazzi, Raffaella Bernardi","submitted_at":"2026-06-03T11:33:17Z","abstract_excerpt":"Large language models (LLMs) are increasingly deployed as autonomous agents in scientific tasks. Yet whether these systems can effectively engage in forms of inductive reasoning relevant to scientific discovery remains an open question. In this work, we introduce FALSIFYBENCH, an evaluation framework for hypothesis-driven reasoning inspired by the classic Wason 2-4-6 task, in which agents must discover hidden semantic properties by iteratively proposing examples and receiving feedback. This task captures key elements of scientific reasoning: hypothesis generation, evidence gathering, and belie"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.04751","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2026-06-03T11:33:17Z","cross_cats_sorted":[],"title_canon_sha256":"b1d130ad5aaac4336069a46f3566930405f60c7d41963e9c3dab02e9c5500f55","abstract_canon_sha256":"e2068e766d7b5a4e6b62aabdec581478cfa3196c1027661b4e430d520aa32d54"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-04T01:09:28.484749Z","signature_b64":"EQqWsKLRdLYaoKVgCafg0LWEEF1+/RLTkyzfN/xWFJZAXsXJRudNjF93ZR1UuNmPVlP3Ogx9ezmwTaj4CUfkDA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"50daf4524b241fceb27e80b6fc82ffb9c9f50420cb020478e52e0e5ec3c6e7f2","last_reissued_at":"2026-06-04T01:09:28.483823Z","signature_status":"signed_v1","first_computed_at":"2026-06-04T01:09:28.483823Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"FALSIFYBENCH: Evaluating Inductive Reasoning in LLMs with Rule Discovery Games","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.AI","authors_text":"Katya Tentori, Leonardo Bertolazzi, Raffaella Bernardi","submitted_at":"2026-06-03T11:33:17Z","abstract_excerpt":"Large language models (LLMs) are increasingly deployed as autonomous agents in scientific tasks. Yet whether these systems can effectively engage in forms of inductive reasoning relevant to scientific discovery remains an open question. In this work, we introduce FALSIFYBENCH, an evaluation framework for hypothesis-driven reasoning inspired by the classic Wason 2-4-6 task, in which agents must discover hidden semantic properties by iteratively proposing examples and receiving feedback. This task captures key elements of scientific reasoning: hypothesis generation, evidence gathering, and belie"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.04751","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.04751/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.04751","created_at":"2026-06-04T01:09:28.483949+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.04751v1","created_at":"2026-06-04T01:09:28.483949+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.04751","created_at":"2026-06-04T01:09:28.483949+00:00"},{"alias_kind":"pith_short_12","alias_value":"KDNPIUSLEQP4","created_at":"2026-06-04T01:09:28.483949+00:00"},{"alias_kind":"pith_short_16","alias_value":"KDNPIUSLEQP45MT6","created_at":"2026-06-04T01:09:28.483949+00:00"},{"alias_kind":"pith_short_8","alias_value":"KDNPIUSL","created_at":"2026-06-04T01:09:28.483949+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/KDNPIUSLEQP45MT6QC3PZAX7XH","json":"https://pith.science/pith/KDNPIUSLEQP45MT6QC3PZAX7XH.json","graph_json":"https://pith.science/api/pith-number/KDNPIUSLEQP45MT6QC3PZAX7XH/graph.json","events_json":"https://pith.science/api/pith-number/KDNPIUSLEQP45MT6QC3PZAX7XH/events.json","paper":"https://pith.science/paper/KDNPIUSL"},"agent_actions":{"view_html":"https://pith.science/pith/KDNPIUSLEQP45MT6QC3PZAX7XH","download_json":"https://pith.science/pith/KDNPIUSLEQP45MT6QC3PZAX7XH.json","view_paper":"https://pith.science/paper/KDNPIUSL","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.04751&json=true","fetch_graph":"https://pith.science/api/pith-number/KDNPIUSLEQP45MT6QC3PZAX7XH/graph.json","fetch_events":"https://pith.science/api/pith-number/KDNPIUSLEQP45MT6QC3PZAX7XH/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/KDNPIUSLEQP45MT6QC3PZAX7XH/action/timestamp_anchor","attest_storage":"https://pith.science/pith/KDNPIUSLEQP45MT6QC3PZAX7XH/action/storage_attestation","attest_author":"https://pith.science/pith/KDNPIUSLEQP45MT6QC3PZAX7XH/action/author_attestation","sign_citation":"https://pith.science/pith/KDNPIUSLEQP45MT6QC3PZAX7XH/action/citation_signature","submit_replication":"https://pith.science/pith/KDNPIUSLEQP45MT6QC3PZAX7XH/action/replication_record"}},"created_at":"2026-06-04T01:09:28.483949+00:00","updated_at":"2026-06-04T01:09:28.483949+00:00"}