{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2018:UTITDBCIZOOG6ELUTUPTGBCI6T","short_pith_number":"pith:UTITDBCI","schema_version":"1.0","canonical_sha256":"a4d1318448cb9c6f11749d1f330448f4cd1c36fa3a4f4792edf0992c5fe9cf22","source":{"kind":"arxiv","id":"1811.01910","version":2},"attestation_state":"computed","paper":{"title":"Evolutionary Data Measures: Understanding the Difficulty of Text Classification Tasks","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI","cs.LG","cs.NE"],"primary_cat":"cs.CL","authors_text":"Bingbing Zhang, Edward Collins, Nikolai Rozanov","submitted_at":"2018-11-05T18:39:54Z","abstract_excerpt":"Classification tasks are usually analysed and improved through new model architectures or hyperparameter optimisation but the underlying properties of datasets are discovered on an ad-hoc basis as errors occur. However, understanding the properties of the data is crucial in perfecting models. In this paper we analyse exactly which characteristics of a dataset best determine how difficult that dataset is for the task of text classification. We then propose an intuitive measure of difficulty for text classification datasets which is simple and fast to calculate. We show that this measure general"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1811.01910","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2018-11-05T18:39:54Z","cross_cats_sorted":["cs.AI","cs.LG","cs.NE"],"title_canon_sha256":"73daee9b537a8f0ca9fc155da528597a716c79cd8d848cfc5c5b6ca945cc4846","abstract_canon_sha256":"0fc69870ebadafac139941706fa495d009a5e683b206c78ee8240e15bf3f8df5"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:58:52.395157Z","signature_b64":"+qi585RR9LpM0X7MJwMXBdVdT2QgA9ECxhP0x5FAePOIrtAi5bMAv9TrlJfRm08cDC1Y5Ean5EO0hA0Vp3b5Bg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"a4d1318448cb9c6f11749d1f330448f4cd1c36fa3a4f4792edf0992c5fe9cf22","last_reissued_at":"2026-05-17T23:58:52.394655Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:58:52.394655Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Evolutionary Data Measures: Understanding the Difficulty of Text Classification Tasks","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI","cs.LG","cs.NE"],"primary_cat":"cs.CL","authors_text":"Bingbing Zhang, Edward Collins, Nikolai Rozanov","submitted_at":"2018-11-05T18:39:54Z","abstract_excerpt":"Classification tasks are usually analysed and improved through new model architectures or hyperparameter optimisation but the underlying properties of datasets are discovered on an ad-hoc basis as errors occur. However, understanding the properties of the data is crucial in perfecting models. In this paper we analyse exactly which characteristics of a dataset best determine how difficult that dataset is for the task of text classification. We then propose an intuitive measure of difficulty for text classification datasets which is simple and fast to calculate. We show that this measure general"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1811.01910","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1811.01910","created_at":"2026-05-17T23:58:52.394733+00:00"},{"alias_kind":"arxiv_version","alias_value":"1811.01910v2","created_at":"2026-05-17T23:58:52.394733+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1811.01910","created_at":"2026-05-17T23:58:52.394733+00:00"},{"alias_kind":"pith_short_12","alias_value":"UTITDBCIZOOG","created_at":"2026-05-18T12:32:56.356000+00:00"},{"alias_kind":"pith_short_16","alias_value":"UTITDBCIZOOG6ELU","created_at":"2026-05-18T12:32:56.356000+00:00"},{"alias_kind":"pith_short_8","alias_value":"UTITDBCI","created_at":"2026-05-18T12:32:56.356000+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/UTITDBCIZOOG6ELUTUPTGBCI6T","json":"https://pith.science/pith/UTITDBCIZOOG6ELUTUPTGBCI6T.json","graph_json":"https://pith.science/api/pith-number/UTITDBCIZOOG6ELUTUPTGBCI6T/graph.json","events_json":"https://pith.science/api/pith-number/UTITDBCIZOOG6ELUTUPTGBCI6T/events.json","paper":"https://pith.science/paper/UTITDBCI"},"agent_actions":{"view_html":"https://pith.science/pith/UTITDBCIZOOG6ELUTUPTGBCI6T","download_json":"https://pith.science/pith/UTITDBCIZOOG6ELUTUPTGBCI6T.json","view_paper":"https://pith.science/paper/UTITDBCI","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1811.01910&json=true","fetch_graph":"https://pith.science/api/pith-number/UTITDBCIZOOG6ELUTUPTGBCI6T/graph.json","fetch_events":"https://pith.science/api/pith-number/UTITDBCIZOOG6ELUTUPTGBCI6T/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/UTITDBCIZOOG6ELUTUPTGBCI6T/action/timestamp_anchor","attest_storage":"https://pith.science/pith/UTITDBCIZOOG6ELUTUPTGBCI6T/action/storage_attestation","attest_author":"https://pith.science/pith/UTITDBCIZOOG6ELUTUPTGBCI6T/action/author_attestation","sign_citation":"https://pith.science/pith/UTITDBCIZOOG6ELUTUPTGBCI6T/action/citation_signature","submit_replication":"https://pith.science/pith/UTITDBCIZOOG6ELUTUPTGBCI6T/action/replication_record"}},"created_at":"2026-05-17T23:58:52.394733+00:00","updated_at":"2026-05-17T23:58:52.394733+00:00"}