{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:LR3O5OWUVJTULQHONXTB3CUWTK","short_pith_number":"pith:LR3O5OWU","schema_version":"1.0","canonical_sha256":"5c76eebad4aa6745c0ee6de61d8a969a97877fecb9550fc7daca827793eb9dfb","source":{"kind":"arxiv","id":"2512.09506","version":5},"attestation_state":"computed","paper":{"title":"Beyond Knowledge to Agency: Evaluating Expertise, Autonomy, and Integrity in Finance with CNFinBench","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CE","authors_text":"Boyi Xiao, Chao Ding, Dawei Cheng, Jiayuan Chen, Jie Xu, Jinru Ding, Junming Guan, Tiantian Yuan, Wenrao Pang, Yidong Jiang, Yun Zhong, Zhiqiang Liu","submitted_at":"2025-12-10T10:30:00Z","abstract_excerpt":"As large language models (LLMs) become high-privilege agents in risk-sensitive settings, they introduce systemic threats beyond hallucination, where minor compliance errors can cause critical data leaks. However, existing benchmarks focus on rule-based QA, lacking agentic execution modeling, overlooking compliance drift in adversarial interactions, and relying on binary safety metrics that fail to capture behavioral degradation. To bridge these gaps, we present CNFinBench, a comprehensive benchmark spanning 29 subtasks grounded in the triad of expertise, autonomy, and integrity. It assesses do"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2512.09506","kind":"arxiv","version":5},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CE","submitted_at":"2025-12-10T10:30:00Z","cross_cats_sorted":[],"title_canon_sha256":"f3da992d94b9e8d5fe5922dc0239d6dcff53ace88a31b4a26ee8159a7426e3e7","abstract_canon_sha256":"a7256f8e9cdf658ca5d93e8eb1e9b41976aa968249414c1b74424f4160491821"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-01T01:02:27.020372Z","signature_b64":"C5qhwtVMP4DBzdwWfU0/K6uF0aMHj5U7j4waA/9hq1R/CD7iufoeSHOZU9o+Tjum6OCumXUdgOUB8rtj5K5JBw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"5c76eebad4aa6745c0ee6de61d8a969a97877fecb9550fc7daca827793eb9dfb","last_reissued_at":"2026-06-01T01:02:27.019495Z","signature_status":"signed_v1","first_computed_at":"2026-06-01T01:02:27.019495Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Beyond Knowledge to Agency: Evaluating Expertise, Autonomy, and Integrity in Finance with CNFinBench","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CE","authors_text":"Boyi Xiao, Chao Ding, Dawei Cheng, Jiayuan Chen, Jie Xu, Jinru Ding, Junming Guan, Tiantian Yuan, Wenrao Pang, Yidong Jiang, Yun Zhong, Zhiqiang Liu","submitted_at":"2025-12-10T10:30:00Z","abstract_excerpt":"As large language models (LLMs) become high-privilege agents in risk-sensitive settings, they introduce systemic threats beyond hallucination, where minor compliance errors can cause critical data leaks. However, existing benchmarks focus on rule-based QA, lacking agentic execution modeling, overlooking compliance drift in adversarial interactions, and relying on binary safety metrics that fail to capture behavioral degradation. To bridge these gaps, we present CNFinBench, a comprehensive benchmark spanning 29 subtasks grounded in the triad of expertise, autonomy, and integrity. It assesses do"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2512.09506","kind":"arxiv","version":5},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2512.09506/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2512.09506","created_at":"2026-06-01T01:02:27.019647+00:00"},{"alias_kind":"arxiv_version","alias_value":"2512.09506v5","created_at":"2026-06-01T01:02:27.019647+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2512.09506","created_at":"2026-06-01T01:02:27.019647+00:00"},{"alias_kind":"pith_short_12","alias_value":"LR3O5OWUVJTU","created_at":"2026-06-01T01:02:27.019647+00:00"},{"alias_kind":"pith_short_16","alias_value":"LR3O5OWUVJTULQHO","created_at":"2026-06-01T01:02:27.019647+00:00"},{"alias_kind":"pith_short_8","alias_value":"LR3O5OWU","created_at":"2026-06-01T01:02:27.019647+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":3,"internal_anchor_count":3,"sample":[{"citing_arxiv_id":"2605.20759","citing_title":"Rethinking Fraud Safety Evaluation: Multi-Round Attacks Reveal Safety-Utility Tradeoffs in Graph-Context LLM Defenders","ref_index":31,"is_internal_anchor":true},{"citing_arxiv_id":"2605.18025","citing_title":"TeleCom-Bench: How Far Are Large Language Models from Industrial Telecommunication Applications?","ref_index":12,"is_internal_anchor":true},{"citing_arxiv_id":"2605.17962","citing_title":"FinDocMRE: A Benchmark for Document-Level Financial Multimodal Reasoning Evaluation","ref_index":4,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/LR3O5OWUVJTULQHONXTB3CUWTK","json":"https://pith.science/pith/LR3O5OWUVJTULQHONXTB3CUWTK.json","graph_json":"https://pith.science/api/pith-number/LR3O5OWUVJTULQHONXTB3CUWTK/graph.json","events_json":"https://pith.science/api/pith-number/LR3O5OWUVJTULQHONXTB3CUWTK/events.json","paper":"https://pith.science/paper/LR3O5OWU"},"agent_actions":{"view_html":"https://pith.science/pith/LR3O5OWUVJTULQHONXTB3CUWTK","download_json":"https://pith.science/pith/LR3O5OWUVJTULQHONXTB3CUWTK.json","view_paper":"https://pith.science/paper/LR3O5OWU","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2512.09506&json=true","fetch_graph":"https://pith.science/api/pith-number/LR3O5OWUVJTULQHONXTB3CUWTK/graph.json","fetch_events":"https://pith.science/api/pith-number/LR3O5OWUVJTULQHONXTB3CUWTK/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/LR3O5OWUVJTULQHONXTB3CUWTK/action/timestamp_anchor","attest_storage":"https://pith.science/pith/LR3O5OWUVJTULQHONXTB3CUWTK/action/storage_attestation","attest_author":"https://pith.science/pith/LR3O5OWUVJTULQHONXTB3CUWTK/action/author_attestation","sign_citation":"https://pith.science/pith/LR3O5OWUVJTULQHONXTB3CUWTK/action/citation_signature","submit_replication":"https://pith.science/pith/LR3O5OWUVJTULQHONXTB3CUWTK/action/replication_record"}},"created_at":"2026-06-01T01:02:27.019647+00:00","updated_at":"2026-06-01T01:02:27.019647+00:00"}