{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:R5MPPVO6UYQITCVRWN65DUXVSS","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"62acc9da46ef10fd8f8337152667d23794d8d3aaaf85bd54abd2ab4057369a2d","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-13T17:59:52Z","title_canon_sha256":"fabe073c790c2c92566cbfcb7de748b9fa4159520877aedf897b618b261c0727"},"schema_version":"1.0","source":{"id":"2605.13846","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.13846","created_at":"2026-05-18T02:44:09Z"},{"alias_kind":"arxiv_version","alias_value":"2605.13846v1","created_at":"2026-05-18T02:44:09Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.13846","created_at":"2026-05-18T02:44:09Z"},{"alias_kind":"pith_short_12","alias_value":"R5MPPVO6UYQI","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"R5MPPVO6UYQITCVR","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"R5MPPVO6","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:98bc6ba2038f5833c17142f4dc39aa3c583910dae0b024f6ad51cd78024431c1","target":"graph","created_at":"2026-05-18T02:44:09Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Using a mere 6 hours of annotated data, WARDEN outperforms larger open-source and proprietary models and establishes a strong baseline."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the two-stage pipeline with Sundanese initialization and dictionary-guided LLM reasoning will reliably outperform unified models in extremely low-data regimes without overfitting or domain mismatch."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"WARDEN achieves better transcription and translation for Wardaman than larger models by separating the tasks and using Sundanese initialization plus a domain dictionary with just 6 hours of data."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"A two-stage pipeline transcribes and translates the endangered Wardaman language using only six hours of annotated data."}],"snapshot_sha256":"893b320c7c2ce5706d14ccf636b9a690724f927fbbaffd23e1b49184b075c62c"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"This paper introduces WARDEN, an early language model system capable of transcribing and translating Wardaman, an endangered Australian indigenous language into English. The significant challenge we face is the lack of large-scale training data: in fact, we only have 6 hours of annotated audio. Therefore, while it is common practice to train a single model for transcription and translation using large datasets (like English to French), this practice is no longer viable in the Wardaman to English context. To tackle the low-resource challenge, we design WARDEN to have separate transcription and ","authors_text":"Liang Zheng, Naijing Liu, Yunzhong Hou, Ziheng Zhang","cross_cats":["cs.AI"],"headline":"A two-stage pipeline transcribes and translates the endangered Wardaman language using only six hours of annotated data.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-13T17:59:52Z","title":"WARDEN: Endangered Indigenous Language Transcription and Translation with 6 Hours of Training Data"},"references":{"count":23,"internal_anchors":2,"resolved_work":23,"sample":[{"cited_arxiv_id":"2605.13846","doi":"","is_internal_anchor":true,"ref_index":1,"title":"WARDEN: Endangered Indigenous Language Transcription and Translation with 6 Hours of Training Data","work_id":"5f8ca83e-607b-4ca8-9cfe-f32266e93be0","year":2025},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Related Work Recent studies have shown that fine-tuning models like Whis- per for translation and transcription require a significant amount of data even when dealing with low-resource languages. Liu ","work_id":"3410da38-280e-4de6-af6e-9a52e9b63152","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"1, the proposed W ARDEN system is com- posed of two separate stages: a transcription stage and a trans- lation stage","work_id":"511d9ff0-f732-453b-82f7-ecfa97d82393","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Transcription:{transcription}. Lexicon en- tries:{lexicon entries}","work_id":"24452fe1-3cf2-4497-b436-de1962e4616d","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Dataset Data in this paper comes from a long-term anthropological lin- guistic documentation project on the Wardaman language from 1976 to 2025","work_id":"bc7d3858-fe66-47ce-b3e3-951e9a008fec","year":1976}],"snapshot_sha256":"b9f035985a28e461a836e8e78396f9a659239f39b313b9385844bb4e99cc09c0"},"source":{"id":"2605.13846","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-14T18:51:46.071355Z","id":"7ef8bec4-fa91-478f-a240-99af85c467b5","model_set":{"reader":"grok-4.3"},"one_line_summary":"WARDEN achieves better transcription and translation for Wardaman than larger models by separating the tasks and using Sundanese initialization plus a domain dictionary with just 6 hours of data.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"A two-stage pipeline transcribes and translates the endangered Wardaman language using only six hours of annotated data.","strongest_claim":"Using a mere 6 hours of annotated data, WARDEN outperforms larger open-source and proprietary models and establishes a strong baseline.","weakest_assumption":"That the two-stage pipeline with Sundanese initialization and dictionary-guided LLM reasoning will reliably outperform unified models in extremely low-data regimes without overfitting or domain mismatch."}},"verdict_id":"7ef8bec4-fa91-478f-a240-99af85c467b5"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:823847afc1ea3b1189be3f29cba0d3d67443e63a21c3aab10fd0e30cb699f3a9","target":"record","created_at":"2026-05-18T02:44:09Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"62acc9da46ef10fd8f8337152667d23794d8d3aaaf85bd54abd2ab4057369a2d","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-13T17:59:52Z","title_canon_sha256":"fabe073c790c2c92566cbfcb7de748b9fa4159520877aedf897b618b261c0727"},"schema_version":"1.0","source":{"id":"2605.13846","kind":"arxiv","version":1}},"canonical_sha256":"8f58f7d5dea620898ab1b37dd1d2f59497c7c6ddbad36c604dfd4e6613f93dc2","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"8f58f7d5dea620898ab1b37dd1d2f59497c7c6ddbad36c604dfd4e6613f93dc2","first_computed_at":"2026-05-18T02:44:09.347264Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-18T02:44:09.347264Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"Eeh8/L5gLTniuqYdFPdqiXlC10WyFuCv+7UPAVKjnqscDFx+z5plsEsgcCXp2B60iredUNuRedYLI4TZVxCsBw==","signature_status":"signed_v1","signed_at":"2026-05-18T02:44:09.347761Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.13846","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:823847afc1ea3b1189be3f29cba0d3d67443e63a21c3aab10fd0e30cb699f3a9","sha256:98bc6ba2038f5833c17142f4dc39aa3c583910dae0b024f6ad51cd78024431c1"],"state_sha256":"6ff796642505b89e98c8690ad432516818e9fd7ee6ab34d17b219e23685dd56b"}