{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2019:C7XXWVHAMRX5K4J6P522DBIZ7S","short_pith_number":"pith:C7XXWVHA","canonical_record":{"source":{"id":"1904.01873","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2019-04-03T09:27:57Z","cross_cats_sorted":["cs.SE"],"title_canon_sha256":"89bddf1f8e9501fb737c7924af074030333f4057cb597f140f566d93550648e9","abstract_canon_sha256":"78875e90a53e951eed810256e210aa6bad21ef8a9c771075d573fec1434b2b52"},"schema_version":"1.0"},"canonical_sha256":"17ef7b54e0646fd5713e7f75a18519fc8be37b3e500f370d6bbe0b3162a22a24","source":{"kind":"arxiv","id":"1904.01873","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"1904.01873","created_at":"2026-05-17T23:49:29Z"},{"alias_kind":"arxiv_version","alias_value":"1904.01873v1","created_at":"2026-05-17T23:49:29Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1904.01873","created_at":"2026-05-17T23:49:29Z"},{"alias_kind":"pith_short_12","alias_value":"C7XXWVHAMRX5","created_at":"2026-05-18T12:33:12Z"},{"alias_kind":"pith_short_16","alias_value":"C7XXWVHAMRX5K4J6","created_at":"2026-05-18T12:33:12Z"},{"alias_kind":"pith_short_8","alias_value":"C7XXWVHA","created_at":"2026-05-18T12:33:12Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2019:C7XXWVHAMRX5K4J6P522DBIZ7S","target":"record","payload":{"canonical_record":{"source":{"id":"1904.01873","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2019-04-03T09:27:57Z","cross_cats_sorted":["cs.SE"],"title_canon_sha256":"89bddf1f8e9501fb737c7924af074030333f4057cb597f140f566d93550648e9","abstract_canon_sha256":"78875e90a53e951eed810256e210aa6bad21ef8a9c771075d573fec1434b2b52"},"schema_version":"1.0"},"canonical_sha256":"17ef7b54e0646fd5713e7f75a18519fc8be37b3e500f370d6bbe0b3162a22a24","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:49:29.388758Z","signature_b64":"0ML+lG7kTgU93nw/QpvO9eps88D21M6J3NsdorgGEvWX/KVRlI7IjM0Djz9eiE6H5N3hyDawLYmpc4pE35aVBA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"17ef7b54e0646fd5713e7f75a18519fc8be37b3e500f370d6bbe0b3162a22a24","last_reissued_at":"2026-05-17T23:49:29.388094Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:49:29.388094Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"1904.01873","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:49:29Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"V6hmGiz0T7abP+62oZFB/wg/yjDTdhiZjwBgXbaRQupP7O7OLEgvE+MZ8Sr2FQdmiAYgr0ZPCFXLHh2+75rrDQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-02T02:59:28.304231Z"},"content_sha256":"2eaa47d848e783faeb29d9ffdd25856269299c6c148a783e0f0dccf6f845983e","schema_version":"1.0","event_id":"sha256:2eaa47d848e783faeb29d9ffdd25856269299c6c148a783e0f0dccf6f845983e"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2019:C7XXWVHAMRX5K4J6P522DBIZ7S","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Modeling Vocabulary for Big Code Machine Learning","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.SE"],"primary_cat":"cs.CL","authors_text":"Andrea Janes, Hlib Babii, Romain Robbes","submitted_at":"2019-04-03T09:27:57Z","abstract_excerpt":"When building machine learning models that operate on source code, several decisions have to be made to model source-code vocabulary. These decisions can have a large impact: some can lead to not being able to train models at all, others significantly affect performance, particularly for Neural Language Models. Yet, these decisions are not often fully described. This paper lists important modeling choices for source code vocabulary, and explores their impact on the resulting vocabulary on a large-scale corpus of 14,436 projects. We show that a subset of decisions have decisive characteristics,"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1904.01873","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:49:29Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"LpsYdn0Nd1+3mV6IKNFgdIgmVuEdqf5m/iWmeUrrItvqqHL1ZZ8H4/RxtxR+xAd3YpNwuhNsDjpgV9b63lYZAA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-02T02:59:28.304597Z"},"content_sha256":"725398365f3ceee34b92fbd1d493a6f9a5c5f35d978653cecd4121c19710e8d8","schema_version":"1.0","event_id":"sha256:725398365f3ceee34b92fbd1d493a6f9a5c5f35d978653cecd4121c19710e8d8"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/C7XXWVHAMRX5K4J6P522DBIZ7S/bundle.json","state_url":"https://pith.science/pith/C7XXWVHAMRX5K4J6P522DBIZ7S/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/C7XXWVHAMRX5K4J6P522DBIZ7S/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-02T02:59:28Z","links":{"resolver":"https://pith.science/pith/C7XXWVHAMRX5K4J6P522DBIZ7S","bundle":"https://pith.science/pith/C7XXWVHAMRX5K4J6P522DBIZ7S/bundle.json","state":"https://pith.science/pith/C7XXWVHAMRX5K4J6P522DBIZ7S/state.json","well_known_bundle":"https://pith.science/.well-known/pith/C7XXWVHAMRX5K4J6P522DBIZ7S/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2019:C7XXWVHAMRX5K4J6P522DBIZ7S","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"78875e90a53e951eed810256e210aa6bad21ef8a9c771075d573fec1434b2b52","cross_cats_sorted":["cs.SE"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2019-04-03T09:27:57Z","title_canon_sha256":"89bddf1f8e9501fb737c7924af074030333f4057cb597f140f566d93550648e9"},"schema_version":"1.0","source":{"id":"1904.01873","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"1904.01873","created_at":"2026-05-17T23:49:29Z"},{"alias_kind":"arxiv_version","alias_value":"1904.01873v1","created_at":"2026-05-17T23:49:29Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1904.01873","created_at":"2026-05-17T23:49:29Z"},{"alias_kind":"pith_short_12","alias_value":"C7XXWVHAMRX5","created_at":"2026-05-18T12:33:12Z"},{"alias_kind":"pith_short_16","alias_value":"C7XXWVHAMRX5K4J6","created_at":"2026-05-18T12:33:12Z"},{"alias_kind":"pith_short_8","alias_value":"C7XXWVHA","created_at":"2026-05-18T12:33:12Z"}],"graph_snapshots":[{"event_id":"sha256:725398365f3ceee34b92fbd1d493a6f9a5c5f35d978653cecd4121c19710e8d8","target":"graph","created_at":"2026-05-17T23:49:29Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"When building machine learning models that operate on source code, several decisions have to be made to model source-code vocabulary. These decisions can have a large impact: some can lead to not being able to train models at all, others significantly affect performance, particularly for Neural Language Models. Yet, these decisions are not often fully described. This paper lists important modeling choices for source code vocabulary, and explores their impact on the resulting vocabulary on a large-scale corpus of 14,436 projects. We show that a subset of decisions have decisive characteristics,","authors_text":"Andrea Janes, Hlib Babii, Romain Robbes","cross_cats":["cs.SE"],"headline":"","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2019-04-03T09:27:57Z","title":"Modeling Vocabulary for Big Code Machine Learning"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1904.01873","kind":"arxiv","version":1},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:2eaa47d848e783faeb29d9ffdd25856269299c6c148a783e0f0dccf6f845983e","target":"record","created_at":"2026-05-17T23:49:29Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"78875e90a53e951eed810256e210aa6bad21ef8a9c771075d573fec1434b2b52","cross_cats_sorted":["cs.SE"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2019-04-03T09:27:57Z","title_canon_sha256":"89bddf1f8e9501fb737c7924af074030333f4057cb597f140f566d93550648e9"},"schema_version":"1.0","source":{"id":"1904.01873","kind":"arxiv","version":1}},"canonical_sha256":"17ef7b54e0646fd5713e7f75a18519fc8be37b3e500f370d6bbe0b3162a22a24","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"17ef7b54e0646fd5713e7f75a18519fc8be37b3e500f370d6bbe0b3162a22a24","first_computed_at":"2026-05-17T23:49:29.388094Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:49:29.388094Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"0ML+lG7kTgU93nw/QpvO9eps88D21M6J3NsdorgGEvWX/KVRlI7IjM0Djz9eiE6H5N3hyDawLYmpc4pE35aVBA==","signature_status":"signed_v1","signed_at":"2026-05-17T23:49:29.388758Z","signed_message":"canonical_sha256_bytes"},"source_id":"1904.01873","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:2eaa47d848e783faeb29d9ffdd25856269299c6c148a783e0f0dccf6f845983e","sha256:725398365f3ceee34b92fbd1d493a6f9a5c5f35d978653cecd4121c19710e8d8"],"state_sha256":"3406a710a2bb6dd8b253a093a1ef4163223cc2f76c43b9842bfcb9f027062e87"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"KZE0HvD9Ods9+VsL6oI2QMW7+i99yPrp5gTxQPcCYO0DRpDQgDJYS/nRc/D+RzPgvdseL3WOO4HE/qU+rtH+Cg==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-02T02:59:28.306470Z","bundle_sha256":"19526a0d6a734a061cb6a5280940d9e7993f974906bb937f04d1e9c69ba5a901"}}