{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:QBDA5KU5FNO4N6WU6RLKYU2MCG","short_pith_number":"pith:QBDA5KU5","schema_version":"1.0","canonical_sha256":"80460eaa9d2b5dc6fad4f456ac534c11b774a5e36f3f4492dcc77d41affae5f3","source":{"kind":"arxiv","id":"2603.20088","version":2},"attestation_state":"computed","paper":{"title":"Towards an Evaluation Methodology for AI in Second Language Education: Lessons Learned from Developing L2-Bench","license":"http://creativecommons.org/licenses/by-sa/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.CY","authors_text":"Ben Knight, Danielle Carvalho, Elizabeth Wonnacott, Isaac Pattis, James Edgell, Wm. Matthew Kennedy","submitted_at":"2026-03-20T16:13:03Z","abstract_excerpt":"The rapid adoption of large language models in AI-powered language education has created an urgent need for evaluations that assess pedagogical effectiveness, particularly in language learning--one of the most common LLM use cases (Tamkin et al. 2024; Costa-Gomes et al. 2025). With only narrowly defined task-specific evaluations of AI system capabilities in second language (L2) education existing in the literature, we require more holistic approaches in this AI for education space. To address this gap, we describe the iteration of the methodology we developed to build L2-Bench, a novel, contex"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2603.20088","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by-sa/4.0/","primary_cat":"cs.CY","submitted_at":"2026-03-20T16:13:03Z","cross_cats_sorted":[],"title_canon_sha256":"fe16bcdc9387b9c702d7f0ffc8741acf0135f5625a1583c2cb1458bdb8dfd023","abstract_canon_sha256":"cd0a8419f3a6681176846344a39160060c9c9372cab523556eeab689249efafc"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-25T02:02:14.517394Z","signature_b64":"hcDzkXD0am5DWVppsEsdCNNoiDueyMPplox4x7Ynqlth458K2EoC0aLBd5FlzqYnaTQV4SKZFLYixemiqAgxAA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"80460eaa9d2b5dc6fad4f456ac534c11b774a5e36f3f4492dcc77d41affae5f3","last_reissued_at":"2026-05-25T02:02:14.516713Z","signature_status":"signed_v1","first_computed_at":"2026-05-25T02:02:14.516713Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Towards an Evaluation Methodology for AI in Second Language Education: Lessons Learned from Developing L2-Bench","license":"http://creativecommons.org/licenses/by-sa/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.CY","authors_text":"Ben Knight, Danielle Carvalho, Elizabeth Wonnacott, Isaac Pattis, James Edgell, Wm. Matthew Kennedy","submitted_at":"2026-03-20T16:13:03Z","abstract_excerpt":"The rapid adoption of large language models in AI-powered language education has created an urgent need for evaluations that assess pedagogical effectiveness, particularly in language learning--one of the most common LLM use cases (Tamkin et al. 2024; Costa-Gomes et al. 2025). With only narrowly defined task-specific evaluations of AI system capabilities in second language (L2) education existing in the literature, we require more holistic approaches in this AI for education space. To address this gap, we describe the iteration of the methodology we developed to build L2-Bench, a novel, contex"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2603.20088","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2603.20088/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2603.20088","created_at":"2026-05-25T02:02:14.516787+00:00"},{"alias_kind":"arxiv_version","alias_value":"2603.20088v2","created_at":"2026-05-25T02:02:14.516787+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2603.20088","created_at":"2026-05-25T02:02:14.516787+00:00"},{"alias_kind":"pith_short_12","alias_value":"QBDA5KU5FNO4","created_at":"2026-05-25T02:02:14.516787+00:00"},{"alias_kind":"pith_short_16","alias_value":"QBDA5KU5FNO4N6WU","created_at":"2026-05-25T02:02:14.516787+00:00"},{"alias_kind":"pith_short_8","alias_value":"QBDA5KU5","created_at":"2026-05-25T02:02:14.516787+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":2,"internal_anchor_count":2,"sample":[{"citing_arxiv_id":"2604.26145","citing_title":"Ceci n'est pas une explication: Evaluating Explanation Failures as Explainability Pitfalls in Language Learning Systems","ref_index":7,"is_internal_anchor":true},{"citing_arxiv_id":"2604.26145","citing_title":"Ceci n'est pas une explication: Evaluating Explanation Failures as Explainability Pitfalls in Language Learning Systems","ref_index":7,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/QBDA5KU5FNO4N6WU6RLKYU2MCG","json":"https://pith.science/pith/QBDA5KU5FNO4N6WU6RLKYU2MCG.json","graph_json":"https://pith.science/api/pith-number/QBDA5KU5FNO4N6WU6RLKYU2MCG/graph.json","events_json":"https://pith.science/api/pith-number/QBDA5KU5FNO4N6WU6RLKYU2MCG/events.json","paper":"https://pith.science/paper/QBDA5KU5"},"agent_actions":{"view_html":"https://pith.science/pith/QBDA5KU5FNO4N6WU6RLKYU2MCG","download_json":"https://pith.science/pith/QBDA5KU5FNO4N6WU6RLKYU2MCG.json","view_paper":"https://pith.science/paper/QBDA5KU5","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2603.20088&json=true","fetch_graph":"https://pith.science/api/pith-number/QBDA5KU5FNO4N6WU6RLKYU2MCG/graph.json","fetch_events":"https://pith.science/api/pith-number/QBDA5KU5FNO4N6WU6RLKYU2MCG/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/QBDA5KU5FNO4N6WU6RLKYU2MCG/action/timestamp_anchor","attest_storage":"https://pith.science/pith/QBDA5KU5FNO4N6WU6RLKYU2MCG/action/storage_attestation","attest_author":"https://pith.science/pith/QBDA5KU5FNO4N6WU6RLKYU2MCG/action/author_attestation","sign_citation":"https://pith.science/pith/QBDA5KU5FNO4N6WU6RLKYU2MCG/action/citation_signature","submit_replication":"https://pith.science/pith/QBDA5KU5FNO4N6WU6RLKYU2MCG/action/replication_record"}},"created_at":"2026-05-25T02:02:14.516787+00:00","updated_at":"2026-05-25T02:02:14.516787+00:00"}