{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2017:26OCQ4RSDN23K3XMUDSCUAV3YF","short_pith_number":"pith:26OCQ4RS","schema_version":"1.0","canonical_sha256":"d79c2872321b75b56eeca0e42a02bbc15ffa6ec6d19e0b40c285bd5bc2d7969f","source":{"kind":"arxiv","id":"1712.10054","version":1},"attestation_state":"computed","paper":{"title":"Corpus specificity in LSA and Word2vec: the role of out-of-domain documents","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.CL","authors_text":"Diego Fernandez Slezak, Edgar Altszyler, Mariano Sigman","submitted_at":"2017-12-28T20:56:16Z","abstract_excerpt":"Latent Semantic Analysis (LSA) and Word2vec are some of the most widely used word embeddings. Despite the popularity of these techniques, the precise mechanisms by which they acquire new semantic relations between words remain unclear. In the present article we investigate whether LSA and Word2vec capacity to identify relevant semantic dimensions increases with size of corpus. One intuitive hypothesis is that the capacity to identify relevant dimensions should increase as the amount of data increases. However, if corpus size grow in topics which are not specific to the domain of interest, sign"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1712.10054","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2017-12-28T20:56:16Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"4ef3a60d67b3f4ff1eb846f3c0f338e143b3fba4f46415e646d55c47c9cfcb49","abstract_canon_sha256":"c738efabd66a84ca6f2a2ff16f8aecf966900ea2b3edbc018d97d875baaf64a0"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:55:59.691905Z","signature_b64":"g+SbeLt7T4ciXxROodRoKsxzcrlNImcam5gIi4engA0PA63X3MmLY9Vdtiygz09Z6L1eao/dlq9q3K5dSAs/DA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"d79c2872321b75b56eeca0e42a02bbc15ffa6ec6d19e0b40c285bd5bc2d7969f","last_reissued_at":"2026-05-17T23:55:59.691237Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:55:59.691237Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Corpus specificity in LSA and Word2vec: the role of out-of-domain documents","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.CL","authors_text":"Diego Fernandez Slezak, Edgar Altszyler, Mariano Sigman","submitted_at":"2017-12-28T20:56:16Z","abstract_excerpt":"Latent Semantic Analysis (LSA) and Word2vec are some of the most widely used word embeddings. Despite the popularity of these techniques, the precise mechanisms by which they acquire new semantic relations between words remain unclear. In the present article we investigate whether LSA and Word2vec capacity to identify relevant semantic dimensions increases with size of corpus. One intuitive hypothesis is that the capacity to identify relevant dimensions should increase as the amount of data increases. However, if corpus size grow in topics which are not specific to the domain of interest, sign"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1712.10054","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1712.10054","created_at":"2026-05-17T23:55:59.691341+00:00"},{"alias_kind":"arxiv_version","alias_value":"1712.10054v1","created_at":"2026-05-17T23:55:59.691341+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1712.10054","created_at":"2026-05-17T23:55:59.691341+00:00"},{"alias_kind":"pith_short_12","alias_value":"26OCQ4RSDN23","created_at":"2026-05-18T12:30:55.937587+00:00"},{"alias_kind":"pith_short_16","alias_value":"26OCQ4RSDN23K3XM","created_at":"2026-05-18T12:30:55.937587+00:00"},{"alias_kind":"pith_short_8","alias_value":"26OCQ4RS","created_at":"2026-05-18T12:30:55.937587+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/26OCQ4RSDN23K3XMUDSCUAV3YF","json":"https://pith.science/pith/26OCQ4RSDN23K3XMUDSCUAV3YF.json","graph_json":"https://pith.science/api/pith-number/26OCQ4RSDN23K3XMUDSCUAV3YF/graph.json","events_json":"https://pith.science/api/pith-number/26OCQ4RSDN23K3XMUDSCUAV3YF/events.json","paper":"https://pith.science/paper/26OCQ4RS"},"agent_actions":{"view_html":"https://pith.science/pith/26OCQ4RSDN23K3XMUDSCUAV3YF","download_json":"https://pith.science/pith/26OCQ4RSDN23K3XMUDSCUAV3YF.json","view_paper":"https://pith.science/paper/26OCQ4RS","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1712.10054&json=true","fetch_graph":"https://pith.science/api/pith-number/26OCQ4RSDN23K3XMUDSCUAV3YF/graph.json","fetch_events":"https://pith.science/api/pith-number/26OCQ4RSDN23K3XMUDSCUAV3YF/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/26OCQ4RSDN23K3XMUDSCUAV3YF/action/timestamp_anchor","attest_storage":"https://pith.science/pith/26OCQ4RSDN23K3XMUDSCUAV3YF/action/storage_attestation","attest_author":"https://pith.science/pith/26OCQ4RSDN23K3XMUDSCUAV3YF/action/author_attestation","sign_citation":"https://pith.science/pith/26OCQ4RSDN23K3XMUDSCUAV3YF/action/citation_signature","submit_replication":"https://pith.science/pith/26OCQ4RSDN23K3XMUDSCUAV3YF/action/replication_record"}},"created_at":"2026-05-17T23:55:59.691341+00:00","updated_at":"2026-05-17T23:55:59.691341+00:00"}