{"paper":{"title":"FINESSE-Bench: A Hierarchical Benchmark Suite for Financial Domain Knowledge and Technical Analysis in Large Language Models","license":"http://creativecommons.org/licenses/by/4.0/","headline":"FINESSE-Bench supplies 3993 questions in eight levels to test financial knowledge progression in large language models.","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Alexey Khoroshilov, Andrei Kalmykov, Denis Kokosinskii, Dmitry Stanishevskii, Dmitry Zmitrovich, Nini Kamkia, Zhirayr Hayrapetyan","submitted_at":"2026-05-14T23:53:51Z","abstract_excerpt":"Large language models (LLMs) are increasingly being applied to financial analysis, reporting, investment decision support, risk management, compliance, and professional training. However, robust evaluation of their domain competence in finance remains incomplete. Widely used open benchmarks such as FinQA, ConvFinQA, and TAT-QA have played an important role in advancing financial question answering and numerical reasoning, but they focus primarily on question answering over financial reports and do not provide an explicit hierarchy of professional difficulty. Broader resources, including Financ"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"We present FINESSE-Bench, a suite of eight specialized benchmarks comprising 3,993 questions for hierarchical evaluation of financial competencies in LLMs.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The questions drawn from CFA-like, CMT-like, and CFTe-like certifications, plus trading tasks, form a valid and progressive hierarchy that accurately reflects real-world transitions from foundational to expert-level financial reasoning.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"FINESSE-Bench is a hierarchical benchmark suite of eight datasets with 3,993 questions for evaluating LLMs on financial domain knowledge, technical analysis, and professional competencies.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"FINESSE-Bench supplies 3993 questions in eight levels to test financial knowledge progression in large language models.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"f69d3d46e64080b3e0d363bf9c6d7e6b3ecb1323c096c0b64619342ebd84dff8"},"source":{"id":"2605.15482","kind":"arxiv","version":1},"verdict":{"id":"b786fd53-c5d4-4de7-aa41-ee0abad991c8","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-19T14:27:38.639245Z","strongest_claim":"We present FINESSE-Bench, a suite of eight specialized benchmarks comprising 3,993 questions for hierarchical evaluation of financial competencies in LLMs.","one_line_summary":"FINESSE-Bench is a hierarchical benchmark suite of eight datasets with 3,993 questions for evaluating LLMs on financial domain knowledge, technical analysis, and professional competencies.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The questions drawn from CFA-like, CMT-like, and CFTe-like certifications, plus trading tasks, form a valid and progressive hierarchy that accurately reflects real-world transitions from foundational to expert-level financial reasoning.","pith_extraction_headline":"FINESSE-Bench supplies 3993 questions in eight levels to test financial knowledge progression in large language models."},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.15482/integrity.json","findings":[],"available":true,"detectors_run":[{"name":"cited_work_retraction","ran_at":"2026-05-19T15:21:57.586185Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"doi_title_agreement","ran_at":"2026-05-19T15:01:17.546125Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"doi_compliance","ran_at":"2026-05-19T14:37:38.314669Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"claim_evidence","ran_at":"2026-05-19T14:21:54.078703Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"citation_quote_validity","ran_at":"2026-05-19T13:49:41.404653Z","status":"skipped","version":"0.1.0","findings_count":0},{"name":"ai_meta_artifact","ran_at":"2026-05-19T13:33:22.654441Z","status":"skipped","version":"1.0.0","findings_count":0}],"snapshot_sha256":"2177c5bf70ad127ba4949ce57ae7f4ecc44c0a0659bbf9fb4edadaad1db7cd81"},"references":{"count":15,"sample":[{"doi":"","year":2021,"title":"arXiv preprint arXiv:2109.00122 , year=","work_id":"a11b4f4b-d693-4a54-b14a-49094201d947","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2022,"title":"Finqa: A dataset of numerical reasoning over financial data","work_id":"36c74fad-5676-4ef0-bbc7-fb4f7022c0da","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2021,"title":"arXiv preprint arXiv:2105.07624 , year =","work_id":"1c86ee16-bad3-4e2b-9df5-bdbfa9ddb687","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"FinanceBench: A New Benchmark for Financial Question Answering","work_id":"b60d115e-50dd-42f6-9178-5d29b05e1e89","ref_index":4,"cited_arxiv_id":"2311.11944","is_internal_anchor":true},{"doi":"","year":2023,"title":"Pixiu: A large language model, instruction data and evaluation benchmark for finance.arXiv preprint arXiv:2306.05443","work_id":"3ce38595-263a-4781-b28b-556a347b8eea","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":15,"snapshot_sha256":"7373230313ea0c61f3cbaba6784862e31cd5f0d388f3af7605f38f833b608f77","internal_anchors":5},"formal_canon":{"evidence_count":2,"snapshot_sha256":"104bf0b3f8c733858d41a85075ee80d134e176b73410e9638f4be4a1ef44c507"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"}