{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2018:SBAD6ASSHO6AY6DJEAZUYI6PUA","short_pith_number":"pith:SBAD6ASS","schema_version":"1.0","canonical_sha256":"90403f02523bbc0c786920334c23cfa03321f691dbf5d3235016ab8fc06bc42b","source":{"kind":"arxiv","id":"1809.10853","version":3},"attestation_state":"computed","paper":{"title":"Adaptive Input Representations for Neural Language Modeling","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Alexei Baevski, Michael Auli","submitted_at":"2018-09-28T04:30:11Z","abstract_excerpt":"We introduce adaptive input representations for neural language modeling which extend the adaptive softmax of Grave et al. (2017) to input representations of variable capacity. There are several choices on how to factorize the input and output layers, and whether to model words, characters or sub-word units. We perform a systematic comparison of popular choices for a self-attentional architecture. Our experiments show that models equipped with adaptive embeddings are more than twice as fast to train than the popular character input CNN while having a lower number of parameters. On the WikiText"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1809.10853","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2018-09-28T04:30:11Z","cross_cats_sorted":[],"title_canon_sha256":"0178b5103fe04c7003bd141803521f3743af466f7366393c9fb3a9040e50e63d","abstract_canon_sha256":"56b6cb0f235cef9a4ad9a464f40eec55c9f69ed2f83741c46cb918a5b22d07ee"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:52:52.117686Z","signature_b64":"rhqY94RxwOYlWduXYYtanNRltphsiHhKD9h0ldPMkO0v8kbgo6hhWVZTLw6lgWEunQuqyV37GythEd/6GDLsCA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"90403f02523bbc0c786920334c23cfa03321f691dbf5d3235016ab8fc06bc42b","last_reissued_at":"2026-05-17T23:52:52.116878Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:52:52.116878Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Adaptive Input Representations for Neural Language Modeling","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Alexei Baevski, Michael Auli","submitted_at":"2018-09-28T04:30:11Z","abstract_excerpt":"We introduce adaptive input representations for neural language modeling which extend the adaptive softmax of Grave et al. (2017) to input representations of variable capacity. There are several choices on how to factorize the input and output layers, and whether to model words, characters or sub-word units. We perform a systematic comparison of popular choices for a self-attentional architecture. Our experiments show that models equipped with adaptive embeddings are more than twice as fast to train than the popular character input CNN while having a lower number of parameters. On the WikiText"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1809.10853","kind":"arxiv","version":3},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1809.10853","created_at":"2026-05-17T23:52:52.117016+00:00"},{"alias_kind":"arxiv_version","alias_value":"1809.10853v3","created_at":"2026-05-17T23:52:52.117016+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1809.10853","created_at":"2026-05-17T23:52:52.117016+00:00"},{"alias_kind":"pith_short_12","alias_value":"SBAD6ASSHO6A","created_at":"2026-05-18T12:32:50.500415+00:00"},{"alias_kind":"pith_short_16","alias_value":"SBAD6ASSHO6AY6DJ","created_at":"2026-05-18T12:32:50.500415+00:00"},{"alias_kind":"pith_short_8","alias_value":"SBAD6ASS","created_at":"2026-05-18T12:32:50.500415+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":11,"internal_anchor_count":6,"sample":[{"citing_arxiv_id":"2311.04799","citing_title":"DA-Cramming: Enhancing Cost-Effective Language Model Pretraining with Dependency Agreement Integration","ref_index":3,"is_internal_anchor":true},{"citing_arxiv_id":"2502.00816","citing_title":"Sundial: A Family of Highly Capable Time Series Foundation Models","ref_index":2,"is_internal_anchor":true},{"citing_arxiv_id":"2307.06435","citing_title":"A Comprehensive Overview of Large Language Models","ref_index":78,"is_internal_anchor":true},{"citing_arxiv_id":"1911.05507","citing_title":"Compressive Transformers for Long-Range Sequence Modelling","ref_index":110,"is_internal_anchor":true},{"citing_arxiv_id":"1906.08237","citing_title":"XLNet: Generalized Autoregressive Pretraining for Language Understanding","ref_index":3,"is_internal_anchor":true},{"citing_arxiv_id":"2401.15947","citing_title":"MoE-LLaVA: Mixture of Experts for Large Vision-Language Models","ref_index":1,"is_internal_anchor":true},{"citing_arxiv_id":"2208.07339","citing_title":"LLM.int8(): 8-bit Matrix Multiplication for Transformers at Scale","ref_index":19,"is_internal_anchor":false},{"citing_arxiv_id":"1909.11942","citing_title":"ALBERT: A Lite BERT for Self-supervised Learning of Language Representations","ref_index":1,"is_internal_anchor":false},{"citing_arxiv_id":"2108.12409","citing_title":"Train Short, Test Long: Attention with Linear Biases Enables Input Length Extrapolation","ref_index":1,"is_internal_anchor":false},{"citing_arxiv_id":"2111.00396","citing_title":"Efficiently Modeling Long Sequences with Structured State Spaces","ref_index":2,"is_internal_anchor":false},{"citing_arxiv_id":"2604.21724","citing_title":"Beyond N-gram: Data-Aware X-GRAM Extraction for Efficient Embedding Parameter Scaling","ref_index":11,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/SBAD6ASSHO6AY6DJEAZUYI6PUA","json":"https://pith.science/pith/SBAD6ASSHO6AY6DJEAZUYI6PUA.json","graph_json":"https://pith.science/api/pith-number/SBAD6ASSHO6AY6DJEAZUYI6PUA/graph.json","events_json":"https://pith.science/api/pith-number/SBAD6ASSHO6AY6DJEAZUYI6PUA/events.json","paper":"https://pith.science/paper/SBAD6ASS"},"agent_actions":{"view_html":"https://pith.science/pith/SBAD6ASSHO6AY6DJEAZUYI6PUA","download_json":"https://pith.science/pith/SBAD6ASSHO6AY6DJEAZUYI6PUA.json","view_paper":"https://pith.science/paper/SBAD6ASS","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1809.10853&json=true","fetch_graph":"https://pith.science/api/pith-number/SBAD6ASSHO6AY6DJEAZUYI6PUA/graph.json","fetch_events":"https://pith.science/api/pith-number/SBAD6ASSHO6AY6DJEAZUYI6PUA/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/SBAD6ASSHO6AY6DJEAZUYI6PUA/action/timestamp_anchor","attest_storage":"https://pith.science/pith/SBAD6ASSHO6AY6DJEAZUYI6PUA/action/storage_attestation","attest_author":"https://pith.science/pith/SBAD6ASSHO6AY6DJEAZUYI6PUA/action/author_attestation","sign_citation":"https://pith.science/pith/SBAD6ASSHO6AY6DJEAZUYI6PUA/action/citation_signature","submit_replication":"https://pith.science/pith/SBAD6ASSHO6AY6DJEAZUYI6PUA/action/replication_record"}},"created_at":"2026-05-17T23:52:52.117016+00:00","updated_at":"2026-05-17T23:52:52.117016+00:00"}