{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2019:ZYDYXM6JT63ISAEYIKE5OVK2MJ","short_pith_number":"pith:ZYDYXM6J","schema_version":"1.0","canonical_sha256":"ce078bb3c99fb68900984289d7555a62645dc814d299cd54aea520517da382f5","source":{"kind":"arxiv","id":"1904.13389","version":1},"attestation_state":"computed","paper":{"title":"Categorical Feature Compression via Submodular Optimization","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","cs.DS","cs.IT","math.IT","stat.ML"],"primary_cat":"cs.LG","authors_text":"Afshin Rostamizadeh, Hossein Esfandiari, Lin Chen, MohammadHossein Bateni, Thomas Fu, Vahab S. Mirrokni","submitted_at":"2019-04-30T17:45:13Z","abstract_excerpt":"In the era of big data, learning from categorical features with very large vocabularies (e.g., 28 million for the Criteo click prediction dataset) has become a practical challenge for machine learning researchers and practitioners. We design a highly-scalable vocabulary compression algorithm that seeks to maximize the mutual information between the compressed categorical feature and the target binary labels and we furthermore show that its solution is guaranteed to be within a $1-1/e \\approx 63\\%$ factor of the global optimal solution. To achieve this, we introduce a novel re-parametrization o"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1904.13389","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2019-04-30T17:45:13Z","cross_cats_sorted":["cs.AI","cs.DS","cs.IT","math.IT","stat.ML"],"title_canon_sha256":"0d623f11816761f16041308672e19a73b0342d88e6fd46afe004105226429848","abstract_canon_sha256":"c4873bf31eb59eeec64369f5c550cb117d885ca39499d0cf6528e74b44f9a766"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:47:21.466612Z","signature_b64":"VFmtpFi9tNbUr7RbPIWv6mj3+XP78W2HE1W/LQURYcvWoLzn+mdCOBG57r3UmKrH1kexN6zlx92sWjcmwbenAg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"ce078bb3c99fb68900984289d7555a62645dc814d299cd54aea520517da382f5","last_reissued_at":"2026-05-17T23:47:21.466166Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:47:21.466166Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Categorical Feature Compression via Submodular Optimization","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","cs.DS","cs.IT","math.IT","stat.ML"],"primary_cat":"cs.LG","authors_text":"Afshin Rostamizadeh, Hossein Esfandiari, Lin Chen, MohammadHossein Bateni, Thomas Fu, Vahab S. Mirrokni","submitted_at":"2019-04-30T17:45:13Z","abstract_excerpt":"In the era of big data, learning from categorical features with very large vocabularies (e.g., 28 million for the Criteo click prediction dataset) has become a practical challenge for machine learning researchers and practitioners. We design a highly-scalable vocabulary compression algorithm that seeks to maximize the mutual information between the compressed categorical feature and the target binary labels and we furthermore show that its solution is guaranteed to be within a $1-1/e \\approx 63\\%$ factor of the global optimal solution. To achieve this, we introduce a novel re-parametrization o"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1904.13389","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1904.13389","created_at":"2026-05-17T23:47:21.466232+00:00"},{"alias_kind":"arxiv_version","alias_value":"1904.13389v1","created_at":"2026-05-17T23:47:21.466232+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1904.13389","created_at":"2026-05-17T23:47:21.466232+00:00"},{"alias_kind":"pith_short_12","alias_value":"ZYDYXM6JT63I","created_at":"2026-05-18T12:33:33.725879+00:00"},{"alias_kind":"pith_short_16","alias_value":"ZYDYXM6JT63ISAEY","created_at":"2026-05-18T12:33:33.725879+00:00"},{"alias_kind":"pith_short_8","alias_value":"ZYDYXM6J","created_at":"2026-05-18T12:33:33.725879+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/ZYDYXM6JT63ISAEYIKE5OVK2MJ","json":"https://pith.science/pith/ZYDYXM6JT63ISAEYIKE5OVK2MJ.json","graph_json":"https://pith.science/api/pith-number/ZYDYXM6JT63ISAEYIKE5OVK2MJ/graph.json","events_json":"https://pith.science/api/pith-number/ZYDYXM6JT63ISAEYIKE5OVK2MJ/events.json","paper":"https://pith.science/paper/ZYDYXM6J"},"agent_actions":{"view_html":"https://pith.science/pith/ZYDYXM6JT63ISAEYIKE5OVK2MJ","download_json":"https://pith.science/pith/ZYDYXM6JT63ISAEYIKE5OVK2MJ.json","view_paper":"https://pith.science/paper/ZYDYXM6J","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1904.13389&json=true","fetch_graph":"https://pith.science/api/pith-number/ZYDYXM6JT63ISAEYIKE5OVK2MJ/graph.json","fetch_events":"https://pith.science/api/pith-number/ZYDYXM6JT63ISAEYIKE5OVK2MJ/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/ZYDYXM6JT63ISAEYIKE5OVK2MJ/action/timestamp_anchor","attest_storage":"https://pith.science/pith/ZYDYXM6JT63ISAEYIKE5OVK2MJ/action/storage_attestation","attest_author":"https://pith.science/pith/ZYDYXM6JT63ISAEYIKE5OVK2MJ/action/author_attestation","sign_citation":"https://pith.science/pith/ZYDYXM6JT63ISAEYIKE5OVK2MJ/action/citation_signature","submit_replication":"https://pith.science/pith/ZYDYXM6JT63ISAEYIKE5OVK2MJ/action/replication_record"}},"created_at":"2026-05-17T23:47:21.466232+00:00","updated_at":"2026-05-17T23:47:21.466232+00:00"}