{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2016:VKYO5JJYY4YTKQIS7E6EZIU6SL","short_pith_number":"pith:VKYO5JJY","canonical_record":{"source":{"id":"1608.03995","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2016-08-13T15:54:10Z","cross_cats_sorted":[],"title_canon_sha256":"32604d7a4957d2f1b26e44395aba74ef6cb5ebdc6f5cdf913d3f4bf93f21d85b","abstract_canon_sha256":"57caa151041eb08a82cd9a6dc62f528d02c00e259d6d8858e2d69adf6d3795b9"},"schema_version":"1.0"},"canonical_sha256":"aab0eea538c731354112f93c4ca29e92c49135cbefe8a65f9b7784e859154c80","source":{"kind":"arxiv","id":"1608.03995","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"1608.03995","created_at":"2026-05-17T23:46:32Z"},{"alias_kind":"arxiv_version","alias_value":"1608.03995v2","created_at":"2026-05-17T23:46:32Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1608.03995","created_at":"2026-05-17T23:46:32Z"},{"alias_kind":"pith_short_12","alias_value":"VKYO5JJYY4YT","created_at":"2026-05-18T12:30:48Z"},{"alias_kind":"pith_short_16","alias_value":"VKYO5JJYY4YTKQIS","created_at":"2026-05-18T12:30:48Z"},{"alias_kind":"pith_short_8","alias_value":"VKYO5JJY","created_at":"2026-05-18T12:30:48Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2016:VKYO5JJYY4YTKQIS7E6EZIU6SL","target":"record","payload":{"canonical_record":{"source":{"id":"1608.03995","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2016-08-13T15:54:10Z","cross_cats_sorted":[],"title_canon_sha256":"32604d7a4957d2f1b26e44395aba74ef6cb5ebdc6f5cdf913d3f4bf93f21d85b","abstract_canon_sha256":"57caa151041eb08a82cd9a6dc62f528d02c00e259d6d8858e2d69adf6d3795b9"},"schema_version":"1.0"},"canonical_sha256":"aab0eea538c731354112f93c4ca29e92c49135cbefe8a65f9b7784e859154c80","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:46:32.191021Z","signature_b64":"hZkP1IFOyCSQeZ75+ZdO7fgpiGWncuFMKuIIb0SXLkMqGVJEwAOqr0n6YYvAoHf19g4fDs7N1Jf4cykL7cHiDQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"aab0eea538c731354112f93c4ca29e92c49135cbefe8a65f9b7784e859154c80","last_reissued_at":"2026-05-17T23:46:32.190334Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:46:32.190334Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"1608.03995","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:46:32Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"cm+DH+B8g0F1lWXNWazE0hBkvN6gfxb+le1vJ37T8KsfQBVT6mPebY+FtWIxDhDUli89WUsQhRSZeXxZm9CRBw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-31T23:10:52.916183Z"},"content_sha256":"f42382326d8c866fc6ea7288221fbcafe316cad80d7d2ca0bbd78e73e06e0a3d","schema_version":"1.0","event_id":"sha256:f42382326d8c866fc6ea7288221fbcafe316cad80d7d2ca0bbd78e73e06e0a3d"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2016:VKYO5JJYY4YTKQIS7E6EZIU6SL","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"An Analysis of Lemmatization on Topic Models of Morphologically Rich Language","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Benjamin Van Durme, Chandler May, Ryan Cotterell","submitted_at":"2016-08-13T15:54:10Z","abstract_excerpt":"Topic models are typically represented by top-$m$ word lists for human interpretation. The corpus is often pre-processed with lemmatization (or stemming) so that those representations are not undermined by a proliferation of words with similar meanings, but there is little public work on the effects of that pre-processing. Recent work studied the effect of stemming on topic models of English texts and found no supporting evidence for the practice. We study the effect of lemmatization on topic models of Russian Wikipedia articles, finding in one configuration that it significantly improves inte"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1608.03995","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:46:32Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"6tcv30fQDixGpXRql+3M8D7gjilPWfpVIIYQUpbw/VczcSKhKU62CiKkz4KxGMOUmVzzGNPoiAX8ntsNSh38Aw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-31T23:10:52.916895Z"},"content_sha256":"5cf3bb64a70aac3f4de09a679ad09cbfb3725420dad1724dbce62c16cc2aa4f8","schema_version":"1.0","event_id":"sha256:5cf3bb64a70aac3f4de09a679ad09cbfb3725420dad1724dbce62c16cc2aa4f8"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/VKYO5JJYY4YTKQIS7E6EZIU6SL/bundle.json","state_url":"https://pith.science/pith/VKYO5JJYY4YTKQIS7E6EZIU6SL/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/VKYO5JJYY4YTKQIS7E6EZIU6SL/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-31T23:10:52Z","links":{"resolver":"https://pith.science/pith/VKYO5JJYY4YTKQIS7E6EZIU6SL","bundle":"https://pith.science/pith/VKYO5JJYY4YTKQIS7E6EZIU6SL/bundle.json","state":"https://pith.science/pith/VKYO5JJYY4YTKQIS7E6EZIU6SL/state.json","well_known_bundle":"https://pith.science/.well-known/pith/VKYO5JJYY4YTKQIS7E6EZIU6SL/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2016:VKYO5JJYY4YTKQIS7E6EZIU6SL","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"57caa151041eb08a82cd9a6dc62f528d02c00e259d6d8858e2d69adf6d3795b9","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2016-08-13T15:54:10Z","title_canon_sha256":"32604d7a4957d2f1b26e44395aba74ef6cb5ebdc6f5cdf913d3f4bf93f21d85b"},"schema_version":"1.0","source":{"id":"1608.03995","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"1608.03995","created_at":"2026-05-17T23:46:32Z"},{"alias_kind":"arxiv_version","alias_value":"1608.03995v2","created_at":"2026-05-17T23:46:32Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1608.03995","created_at":"2026-05-17T23:46:32Z"},{"alias_kind":"pith_short_12","alias_value":"VKYO5JJYY4YT","created_at":"2026-05-18T12:30:48Z"},{"alias_kind":"pith_short_16","alias_value":"VKYO5JJYY4YTKQIS","created_at":"2026-05-18T12:30:48Z"},{"alias_kind":"pith_short_8","alias_value":"VKYO5JJY","created_at":"2026-05-18T12:30:48Z"}],"graph_snapshots":[{"event_id":"sha256:5cf3bb64a70aac3f4de09a679ad09cbfb3725420dad1724dbce62c16cc2aa4f8","target":"graph","created_at":"2026-05-17T23:46:32Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"Topic models are typically represented by top-$m$ word lists for human interpretation. The corpus is often pre-processed with lemmatization (or stemming) so that those representations are not undermined by a proliferation of words with similar meanings, but there is little public work on the effects of that pre-processing. Recent work studied the effect of stemming on topic models of English texts and found no supporting evidence for the practice. We study the effect of lemmatization on topic models of Russian Wikipedia articles, finding in one configuration that it significantly improves inte","authors_text":"Benjamin Van Durme, Chandler May, Ryan Cotterell","cross_cats":[],"headline":"","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2016-08-13T15:54:10Z","title":"An Analysis of Lemmatization on Topic Models of Morphologically Rich Language"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1608.03995","kind":"arxiv","version":2},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:f42382326d8c866fc6ea7288221fbcafe316cad80d7d2ca0bbd78e73e06e0a3d","target":"record","created_at":"2026-05-17T23:46:32Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"57caa151041eb08a82cd9a6dc62f528d02c00e259d6d8858e2d69adf6d3795b9","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2016-08-13T15:54:10Z","title_canon_sha256":"32604d7a4957d2f1b26e44395aba74ef6cb5ebdc6f5cdf913d3f4bf93f21d85b"},"schema_version":"1.0","source":{"id":"1608.03995","kind":"arxiv","version":2}},"canonical_sha256":"aab0eea538c731354112f93c4ca29e92c49135cbefe8a65f9b7784e859154c80","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"aab0eea538c731354112f93c4ca29e92c49135cbefe8a65f9b7784e859154c80","first_computed_at":"2026-05-17T23:46:32.190334Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:46:32.190334Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"hZkP1IFOyCSQeZ75+ZdO7fgpiGWncuFMKuIIb0SXLkMqGVJEwAOqr0n6YYvAoHf19g4fDs7N1Jf4cykL7cHiDQ==","signature_status":"signed_v1","signed_at":"2026-05-17T23:46:32.191021Z","signed_message":"canonical_sha256_bytes"},"source_id":"1608.03995","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:f42382326d8c866fc6ea7288221fbcafe316cad80d7d2ca0bbd78e73e06e0a3d","sha256:5cf3bb64a70aac3f4de09a679ad09cbfb3725420dad1724dbce62c16cc2aa4f8"],"state_sha256":"6acc6ffd86a04d3a9ddd3a367b6f304ba1e9cc116833370c72085779c6349327"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"LNSEzbJDyK3d/FSJqcOYB6BWIefqbEaHMQCDwCMfBloXuUUY8r5R2g8fp+ZAevDBHR/QQztAP7VZ9ZEl7iOCBA==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-31T23:10:52.921254Z","bundle_sha256":"f91e175cf9930f34511a652cbfcd7f3aa6485f384c6198cac5918665aea07929"}}