{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2013:VT2GMQWQBWXADTBY4WVS6VVJNI","short_pith_number":"pith:VT2GMQWQ","schema_version":"1.0","canonical_sha256":"acf46642d00dae01cc38e5ab2f56a96a014574cb08e6cb41f78534015e41a537","source":{"kind":"arxiv","id":"1301.6770","version":1},"attestation_state":"computed","paper":{"title":"An alternative text representation to TF-IDF and Bag-of-Words","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.LG","stat.ML"],"primary_cat":"cs.IR","authors_text":"Fei Sha, Kilian Q. Weinberger, Minmin Chen, Zhixiang (Eddie) Xu","submitted_at":"2013-01-28T21:04:45Z","abstract_excerpt":"In text mining, information retrieval, and machine learning, text documents are commonly represented through variants of sparse Bag of Words (sBoW) vectors (e.g. TF-IDF). Although simple and intuitive, sBoW style representations suffer from their inherent over-sparsity and fail to capture word-level synonymy and polysemy. Especially when labeled data is limited (e.g. in document classification), or the text documents are short (e.g. emails or abstracts), many features are rarely observed within the training corpus. This leads to overfitting and reduced generalization accuracy. In this paper we"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1301.6770","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.IR","submitted_at":"2013-01-28T21:04:45Z","cross_cats_sorted":["cs.LG","stat.ML"],"title_canon_sha256":"c21f3a44fbde509dddd0a3efbd9191b5eb1bf1e694583c7ee7ba4b77f0920714","abstract_canon_sha256":"012f4d5c9fec9f2e1bb33e89a48d9c35c36fe5fe11741fb82e25d99c0659884c"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T03:35:12.415185Z","signature_b64":"JcR7B/tYpJ6yfcSwibEMWq1EIFeQhJ5ad7wIX4REteAIr1/z2Tv7I3xONpRG9A3C+UdUec5QIAVlEoyUrFPcBw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"acf46642d00dae01cc38e5ab2f56a96a014574cb08e6cb41f78534015e41a537","last_reissued_at":"2026-05-18T03:35:12.414177Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T03:35:12.414177Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"An alternative text representation to TF-IDF and Bag-of-Words","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.LG","stat.ML"],"primary_cat":"cs.IR","authors_text":"Fei Sha, Kilian Q. Weinberger, Minmin Chen, Zhixiang (Eddie) Xu","submitted_at":"2013-01-28T21:04:45Z","abstract_excerpt":"In text mining, information retrieval, and machine learning, text documents are commonly represented through variants of sparse Bag of Words (sBoW) vectors (e.g. TF-IDF). Although simple and intuitive, sBoW style representations suffer from their inherent over-sparsity and fail to capture word-level synonymy and polysemy. Especially when labeled data is limited (e.g. in document classification), or the text documents are short (e.g. emails or abstracts), many features are rarely observed within the training corpus. This leads to overfitting and reduced generalization accuracy. In this paper we"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1301.6770","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1301.6770","created_at":"2026-05-18T03:35:12.414329+00:00"},{"alias_kind":"arxiv_version","alias_value":"1301.6770v1","created_at":"2026-05-18T03:35:12.414329+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1301.6770","created_at":"2026-05-18T03:35:12.414329+00:00"},{"alias_kind":"pith_short_12","alias_value":"VT2GMQWQBWXA","created_at":"2026-05-18T12:28:04.890932+00:00"},{"alias_kind":"pith_short_16","alias_value":"VT2GMQWQBWXADTBY","created_at":"2026-05-18T12:28:04.890932+00:00"},{"alias_kind":"pith_short_8","alias_value":"VT2GMQWQ","created_at":"2026-05-18T12:28:04.890932+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/VT2GMQWQBWXADTBY4WVS6VVJNI","json":"https://pith.science/pith/VT2GMQWQBWXADTBY4WVS6VVJNI.json","graph_json":"https://pith.science/api/pith-number/VT2GMQWQBWXADTBY4WVS6VVJNI/graph.json","events_json":"https://pith.science/api/pith-number/VT2GMQWQBWXADTBY4WVS6VVJNI/events.json","paper":"https://pith.science/paper/VT2GMQWQ"},"agent_actions":{"view_html":"https://pith.science/pith/VT2GMQWQBWXADTBY4WVS6VVJNI","download_json":"https://pith.science/pith/VT2GMQWQBWXADTBY4WVS6VVJNI.json","view_paper":"https://pith.science/paper/VT2GMQWQ","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1301.6770&json=true","fetch_graph":"https://pith.science/api/pith-number/VT2GMQWQBWXADTBY4WVS6VVJNI/graph.json","fetch_events":"https://pith.science/api/pith-number/VT2GMQWQBWXADTBY4WVS6VVJNI/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/VT2GMQWQBWXADTBY4WVS6VVJNI/action/timestamp_anchor","attest_storage":"https://pith.science/pith/VT2GMQWQBWXADTBY4WVS6VVJNI/action/storage_attestation","attest_author":"https://pith.science/pith/VT2GMQWQBWXADTBY4WVS6VVJNI/action/author_attestation","sign_citation":"https://pith.science/pith/VT2GMQWQBWXADTBY4WVS6VVJNI/action/citation_signature","submit_replication":"https://pith.science/pith/VT2GMQWQBWXADTBY4WVS6VVJNI/action/replication_record"}},"created_at":"2026-05-18T03:35:12.414329+00:00","updated_at":"2026-05-18T03:35:12.414329+00:00"}