{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:GTDMR7TSQU3WARLF2US7JDQWYO","short_pith_number":"pith:GTDMR7TS","canonical_record":{"source":{"id":"2604.25384","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-04-28T08:51:37Z","cross_cats_sorted":[],"title_canon_sha256":"b3211b2772f07294cb6d545de1715b11ca00518b8f6f133bac194114bd00fc8a","abstract_canon_sha256":"d27b87d2a1fcf8ca6d27a0c3918570e7d1bdad2c72cd160c7bd7391b2a6ea7ec"},"schema_version":"1.0"},"canonical_sha256":"34c6c8fe728537604565d525f48e16c3bf58e9dbef74b018869706bbd0b871e9","source":{"kind":"arxiv","id":"2604.25384","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2604.25384","created_at":"2026-05-20T00:01:42Z"},{"alias_kind":"arxiv_version","alias_value":"2604.25384v2","created_at":"2026-05-20T00:01:42Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2604.25384","created_at":"2026-05-20T00:01:42Z"},{"alias_kind":"pith_short_12","alias_value":"GTDMR7TSQU3W","created_at":"2026-05-20T00:01:42Z"},{"alias_kind":"pith_short_16","alias_value":"GTDMR7TSQU3WARLF","created_at":"2026-05-20T00:01:42Z"},{"alias_kind":"pith_short_8","alias_value":"GTDMR7TS","created_at":"2026-05-20T00:01:42Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:GTDMR7TSQU3WARLF2US7JDQWYO","target":"record","payload":{"canonical_record":{"source":{"id":"2604.25384","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-04-28T08:51:37Z","cross_cats_sorted":[],"title_canon_sha256":"b3211b2772f07294cb6d545de1715b11ca00518b8f6f133bac194114bd00fc8a","abstract_canon_sha256":"d27b87d2a1fcf8ca6d27a0c3918570e7d1bdad2c72cd160c7bd7391b2a6ea7ec"},"schema_version":"1.0"},"canonical_sha256":"34c6c8fe728537604565d525f48e16c3bf58e9dbef74b018869706bbd0b871e9","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T00:01:42.085194Z","signature_b64":"rxP6NRaCtt/wlf2CmS+FDjiowsAzyT+epcHH3fwp3Lxi88mdOCkSXOvuwXuE6SQrinEVXrRHOcgPB9r/POz6Dw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"34c6c8fe728537604565d525f48e16c3bf58e9dbef74b018869706bbd0b871e9","last_reissued_at":"2026-05-20T00:01:42.084522Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T00:01:42.084522Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2604.25384","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-20T00:01:42Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"yTgO1K529Xz2ozMRjoX3LQ26T9qEuUZt4RpyLHmboDJeUweddGfZX5cN13RVN6zCSDZlXyadE0dF11q6aH9/Bg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-26T21:24:49.880083Z"},"content_sha256":"df143262aa2869d9310639515099b67887037480f3c5f3d732fa8c45c244fac0","schema_version":"1.0","event_id":"sha256:df143262aa2869d9310639515099b67887037480f3c5f3d732fa8c45c244fac0"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:GTDMR7TSQU3WARLF2US7JDQWYO","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Wiki Dumps to Training Corpora: South Slavic Case","license":"http://creativecommons.org/licenses/by/4.0/","headline":"A pipeline extracts and filters text from Wikimedia dumps to build clean corpora for seven South Slavic languages.","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Cosimo Palma, Mihailo \\v{S}kori\\'c","submitted_at":"2026-04-28T08:51:37Z","abstract_excerpt":"This paper presents a pipeline designed to transform raw Wikimedia dumps into quality textual corpora for seven South Slavic languages. The work is divided into two major phases. The first involves extracting and cleaning text from raw dumps of Wikipedia, Wikisource, Wikibooks, Wikinews, and Wikiquote. This step requires careful handling of raw wiki markup to isolate, first of all, textual articles, and then usable natural language text within them. The second phase addresses the challenge of questionable or low-quality articles, which are often generated from databases or structured knowledge"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"The n-gram-based filtering strategy detects high levels of textual redundancy between articles and removes such low-quality articles from the corpora entirely, yielding linguistically rich texts suitable for language model training.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That repetitive n-gram patterns reliably mark low-quality database-generated articles while preserving original, high-information content across the seven languages.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"A two-phase pipeline extracts clean text from Wikimedia dumps and applies n-gram filtering to remove repetitive low-quality articles for South Slavic language corpora.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"A pipeline extracts and filters text from Wikimedia dumps to build clean corpora for seven South Slavic languages.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"cb841cea50991a54d0963929a53a94bd5bc3305dd6f8b81be262b3ba1d341246"},"source":{"id":"2604.25384","kind":"arxiv","version":2},"verdict":{"id":"f1cdaf69-180c-4b8b-9ff7-019d7a7779d0","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-19T18:01:08.836670Z","strongest_claim":"The n-gram-based filtering strategy detects high levels of textual redundancy between articles and removes such low-quality articles from the corpora entirely, yielding linguistically rich texts suitable for language model training.","one_line_summary":"A two-phase pipeline extracts clean text from Wikimedia dumps and applies n-gram filtering to remove repetitive low-quality articles for South Slavic language corpora.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That repetitive n-gram patterns reliably mark low-quality database-generated articles while preserving original, high-information content across the seven languages.","pith_extraction_headline":"A pipeline extracts and filters text from Wikimedia dumps to build clean corpora for seven South Slavic languages."},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2604.25384/integrity.json","findings":[],"available":true,"detectors_run":[{"name":"doi_compliance","ran_at":"2026-05-19T21:12:27.301709Z","status":"completed","version":"1.0.0","findings_count":0}],"snapshot_sha256":"01d78f3b454bdf2aab59e04e60a7a6515983468318b517d2baa8056c6efe5eec"},"references":{"count":44,"sample":[{"doi":"","year":2026,"title":"Wiki Dumps to Training Corpora: South Slavic Case","work_id":"cc849718-c54c-4ae4-949c-a68a14a5f699","ref_index":1,"cited_arxiv_id":"2604.25384","is_internal_anchor":true},{"doi":"","year":2026,"title":"(version code 20260401)","work_id":"6907d799-d02c-4b5b-b91a-1f338db4dac2","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"T ext extraction Once the raw dumps are converted into line‑oriented JSON (JSONL) files, each page is processed in batches to extract usable text and metadata","work_id":"140ebf1c-292e-4a55-8d0d-c4e83f23296f","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Initial cleaning and parsing : Applies first regex pass to reduce markup noise and parses the text into a structured representation using mwparserfromhell library","work_id":"835baed3-7688-4c39-a5b3-8949ff24fb01","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Category handling: Identifies and extract category tags into a separate variable, while also removing category markup from the text","work_id":"1942f1e0-9871-4dd8-9bf2-700b9daeb533","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":44,"snapshot_sha256":"df717d5455c719b2cb406efbcce6f3830c52924a6c213e327d616f5b5b3712ea","internal_anchors":1},"formal_canon":{"evidence_count":2,"snapshot_sha256":"6888482999593a40f41365476040a30cd28dd4fbfe1d5096b77dfb6cfaa8c7e9"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"f1cdaf69-180c-4b8b-9ff7-019d7a7779d0"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-20T00:01:42Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"xo9jqGxhKK0htZmu16mrA5b6sUHpRt+UiAmTlMwvprcYNglwcd7Im9QffgAJXFFdNz7PKEfqj99kv8FoW30JAg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-26T21:24:49.880722Z"},"content_sha256":"c65a9d9a3ea51f9153cbcee0bbce029b35db59b532c81ccae60e8d6a979b975e","schema_version":"1.0","event_id":"sha256:c65a9d9a3ea51f9153cbcee0bbce029b35db59b532c81ccae60e8d6a979b975e"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/GTDMR7TSQU3WARLF2US7JDQWYO/bundle.json","state_url":"https://pith.science/pith/GTDMR7TSQU3WARLF2US7JDQWYO/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/GTDMR7TSQU3WARLF2US7JDQWYO/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-26T21:24:49Z","links":{"resolver":"https://pith.science/pith/GTDMR7TSQU3WARLF2US7JDQWYO","bundle":"https://pith.science/pith/GTDMR7TSQU3WARLF2US7JDQWYO/bundle.json","state":"https://pith.science/pith/GTDMR7TSQU3WARLF2US7JDQWYO/state.json","well_known_bundle":"https://pith.science/.well-known/pith/GTDMR7TSQU3WARLF2US7JDQWYO/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:GTDMR7TSQU3WARLF2US7JDQWYO","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"d27b87d2a1fcf8ca6d27a0c3918570e7d1bdad2c72cd160c7bd7391b2a6ea7ec","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-04-28T08:51:37Z","title_canon_sha256":"b3211b2772f07294cb6d545de1715b11ca00518b8f6f133bac194114bd00fc8a"},"schema_version":"1.0","source":{"id":"2604.25384","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2604.25384","created_at":"2026-05-20T00:01:42Z"},{"alias_kind":"arxiv_version","alias_value":"2604.25384v2","created_at":"2026-05-20T00:01:42Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2604.25384","created_at":"2026-05-20T00:01:42Z"},{"alias_kind":"pith_short_12","alias_value":"GTDMR7TSQU3W","created_at":"2026-05-20T00:01:42Z"},{"alias_kind":"pith_short_16","alias_value":"GTDMR7TSQU3WARLF","created_at":"2026-05-20T00:01:42Z"},{"alias_kind":"pith_short_8","alias_value":"GTDMR7TS","created_at":"2026-05-20T00:01:42Z"}],"graph_snapshots":[{"event_id":"sha256:c65a9d9a3ea51f9153cbcee0bbce029b35db59b532c81ccae60e8d6a979b975e","target":"graph","created_at":"2026-05-20T00:01:42Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"The n-gram-based filtering strategy detects high levels of textual redundancy between articles and removes such low-quality articles from the corpora entirely, yielding linguistically rich texts suitable for language model training."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That repetitive n-gram patterns reliably mark low-quality database-generated articles while preserving original, high-information content across the seven languages."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"A two-phase pipeline extracts clean text from Wikimedia dumps and applies n-gram filtering to remove repetitive low-quality articles for South Slavic language corpora."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"A pipeline extracts and filters text from Wikimedia dumps to build clean corpora for seven South Slavic languages."}],"snapshot_sha256":"cb841cea50991a54d0963929a53a94bd5bc3305dd6f8b81be262b3ba1d341246"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"6888482999593a40f41365476040a30cd28dd4fbfe1d5096b77dfb6cfaa8c7e9"},"integrity":{"available":true,"clean":true,"detectors_run":[{"findings_count":0,"name":"doi_compliance","ran_at":"2026-05-19T21:12:27.301709Z","status":"completed","version":"1.0.0"}],"endpoint":"/pith/2604.25384/integrity.json","findings":[],"snapshot_sha256":"01d78f3b454bdf2aab59e04e60a7a6515983468318b517d2baa8056c6efe5eec","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"This paper presents a pipeline designed to transform raw Wikimedia dumps into quality textual corpora for seven South Slavic languages. The work is divided into two major phases. The first involves extracting and cleaning text from raw dumps of Wikipedia, Wikisource, Wikibooks, Wikinews, and Wikiquote. This step requires careful handling of raw wiki markup to isolate, first of all, textual articles, and then usable natural language text within them. The second phase addresses the challenge of questionable or low-quality articles, which are often generated from databases or structured knowledge","authors_text":"Cosimo Palma, Mihailo \\v{S}kori\\'c","cross_cats":[],"headline":"A pipeline extracts and filters text from Wikimedia dumps to build clean corpora for seven South Slavic languages.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-04-28T08:51:37Z","title":"Wiki Dumps to Training Corpora: South Slavic Case"},"references":{"count":44,"internal_anchors":1,"resolved_work":44,"sample":[{"cited_arxiv_id":"2604.25384","doi":"","is_internal_anchor":true,"ref_index":1,"title":"Wiki Dumps to Training Corpora: South Slavic Case","work_id":"cc849718-c54c-4ae4-949c-a68a14a5f699","year":2026},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"(version code 20260401)","work_id":"6907d799-d02c-4b5b-b91a-1f338db4dac2","year":2026},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"T ext extraction Once the raw dumps are converted into line‑oriented JSON (JSONL) files, each page is processed in batches to extract usable text and metadata","work_id":"140ebf1c-292e-4a55-8d0d-c4e83f23296f","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Initial cleaning and parsing : Applies first regex pass to reduce markup noise and parses the text into a structured representation using mwparserfromhell library","work_id":"835baed3-7688-4c39-a5b3-8949ff24fb01","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Category handling: Identifies and extract category tags into a separate variable, while also removing category markup from the text","work_id":"1942f1e0-9871-4dd8-9bf2-700b9daeb533","year":null}],"snapshot_sha256":"df717d5455c719b2cb406efbcce6f3830c52924a6c213e327d616f5b5b3712ea"},"source":{"id":"2604.25384","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-19T18:01:08.836670Z","id":"f1cdaf69-180c-4b8b-9ff7-019d7a7779d0","model_set":{"reader":"grok-4.3"},"one_line_summary":"A two-phase pipeline extracts clean text from Wikimedia dumps and applies n-gram filtering to remove repetitive low-quality articles for South Slavic language corpora.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"A pipeline extracts and filters text from Wikimedia dumps to build clean corpora for seven South Slavic languages.","strongest_claim":"The n-gram-based filtering strategy detects high levels of textual redundancy between articles and removes such low-quality articles from the corpora entirely, yielding linguistically rich texts suitable for language model training.","weakest_assumption":"That repetitive n-gram patterns reliably mark low-quality database-generated articles while preserving original, high-information content across the seven languages."}},"verdict_id":"f1cdaf69-180c-4b8b-9ff7-019d7a7779d0"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:df143262aa2869d9310639515099b67887037480f3c5f3d732fa8c45c244fac0","target":"record","created_at":"2026-05-20T00:01:42Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"d27b87d2a1fcf8ca6d27a0c3918570e7d1bdad2c72cd160c7bd7391b2a6ea7ec","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-04-28T08:51:37Z","title_canon_sha256":"b3211b2772f07294cb6d545de1715b11ca00518b8f6f133bac194114bd00fc8a"},"schema_version":"1.0","source":{"id":"2604.25384","kind":"arxiv","version":2}},"canonical_sha256":"34c6c8fe728537604565d525f48e16c3bf58e9dbef74b018869706bbd0b871e9","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"34c6c8fe728537604565d525f48e16c3bf58e9dbef74b018869706bbd0b871e9","first_computed_at":"2026-05-20T00:01:42.084522Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-20T00:01:42.084522Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"rxP6NRaCtt/wlf2CmS+FDjiowsAzyT+epcHH3fwp3Lxi88mdOCkSXOvuwXuE6SQrinEVXrRHOcgPB9r/POz6Dw==","signature_status":"signed_v1","signed_at":"2026-05-20T00:01:42.085194Z","signed_message":"canonical_sha256_bytes"},"source_id":"2604.25384","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:df143262aa2869d9310639515099b67887037480f3c5f3d732fa8c45c244fac0","sha256:c65a9d9a3ea51f9153cbcee0bbce029b35db59b532c81ccae60e8d6a979b975e"],"state_sha256":"0683db27f1dba6be89736a327af552f6758e80bb62a78cb31ff044fa1cd079b8"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"WnpOncwY5tGH6XdDMvO9UGGjkM7ICx72wv/tXrMdrTp5hcYwCpjoOt6jvBjhPnQhETeUd5oNtmYRdqQlFkBKCQ==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-26T21:24:49.883238Z","bundle_sha256":"ab8cabcc015f2ad536548adb9843d4772a627328dd20419b4867e1af6c5cf14b"}}