{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2019:XRGIFYPBHSS6UPYMDKO5FHVVF4","short_pith_number":"pith:XRGIFYPB","schema_version":"1.0","canonical_sha256":"bc4c82e1e13ca5ea3f0c1a9dd29eb52f39126ec1fe84206e084616e41f873047","source":{"kind":"arxiv","id":"1903.12136","version":1},"attestation_state":"computed","paper":{"title":"Distilling Task-Specific Knowledge from BERT into Simple Neural Networks","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.LG"],"primary_cat":"cs.CL","authors_text":"Jimmy Lin, Lili Mou, Linqing Liu, Olga Vechtomova, Raphael Tang, Yao Lu","submitted_at":"2019-03-28T17:23:50Z","abstract_excerpt":"In the natural language processing literature, neural networks are becoming increasingly deeper and complex. The recent poster child of this trend is the deep language representation model, which includes BERT, ELMo, and GPT. These developments have led to the conviction that previous-generation, shallower neural networks for language understanding are obsolete. In this paper, however, we demonstrate that rudimentary, lightweight neural networks can still be made competitive without architecture changes, external training data, or additional input features. We propose to distill knowledge from"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1903.12136","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2019-03-28T17:23:50Z","cross_cats_sorted":["cs.LG"],"title_canon_sha256":"4c115290a20526281d3e328e9d3128cadb6d2306b264a9f50ebb5e3162c41a58","abstract_canon_sha256":"b23caa599a9bac524296ce34a426d86b0b02282bced1f9baf0b0a827aa38061c"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:49:57.944579Z","signature_b64":"7lFWe6eK5nGpDMD39jOEshAy7jJ59SvRJRPjhVvsKbBJRdAIXGNtmk0F+JL0oecjc83TZd6cgXg981ZJ1tC8Cw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"bc4c82e1e13ca5ea3f0c1a9dd29eb52f39126ec1fe84206e084616e41f873047","last_reissued_at":"2026-05-17T23:49:57.943922Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:49:57.943922Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Distilling Task-Specific Knowledge from BERT into Simple Neural Networks","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.LG"],"primary_cat":"cs.CL","authors_text":"Jimmy Lin, Lili Mou, Linqing Liu, Olga Vechtomova, Raphael Tang, Yao Lu","submitted_at":"2019-03-28T17:23:50Z","abstract_excerpt":"In the natural language processing literature, neural networks are becoming increasingly deeper and complex. The recent poster child of this trend is the deep language representation model, which includes BERT, ELMo, and GPT. These developments have led to the conviction that previous-generation, shallower neural networks for language understanding are obsolete. In this paper, however, we demonstrate that rudimentary, lightweight neural networks can still be made competitive without architecture changes, external training data, or additional input features. We propose to distill knowledge from"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1903.12136","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1903.12136","created_at":"2026-05-17T23:49:57.944011+00:00"},{"alias_kind":"arxiv_version","alias_value":"1903.12136v1","created_at":"2026-05-17T23:49:57.944011+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1903.12136","created_at":"2026-05-17T23:49:57.944011+00:00"},{"alias_kind":"pith_short_12","alias_value":"XRGIFYPBHSS6","created_at":"2026-05-18T12:33:33.725879+00:00"},{"alias_kind":"pith_short_16","alias_value":"XRGIFYPBHSS6UPYM","created_at":"2026-05-18T12:33:33.725879+00:00"},{"alias_kind":"pith_short_8","alias_value":"XRGIFYPB","created_at":"2026-05-18T12:33:33.725879+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":10,"internal_anchor_count":6,"sample":[{"citing_arxiv_id":"2305.02301","citing_title":"Distilling Step-by-Step! Outperforming Larger Language Models with Less Training Data and Smaller Model Sizes","ref_index":101,"is_internal_anchor":true},{"citing_arxiv_id":"2605.15104","citing_title":"From Text to Voice: A Reproducible and Verifiable Framework for Evaluating Tool Calling LLM Agents","ref_index":66,"is_internal_anchor":true},{"citing_arxiv_id":"2605.10933","citing_title":"DECO: Sparse Mixture-of-Experts with Dense-Comparable Performance on End-Side Devices","ref_index":66,"is_internal_anchor":true},{"citing_arxiv_id":"2508.03949","citing_title":"Model Compression vs. Adversarial Robustness: An Empirical Study on Language Models for Code","ref_index":70,"is_internal_anchor":true},{"citing_arxiv_id":"2511.05476","citing_title":"A Metamorphic Testing Perspective on Knowledge Distillation for Language Models of Code: Does the Student Deeply Mimic the Teacher?","ref_index":83,"is_internal_anchor":true},{"citing_arxiv_id":"2306.14048","citing_title":"H$_2$O: Heavy-Hitter Oracle for Efficient Generative Inference of Large Language Models","ref_index":65,"is_internal_anchor":true},{"citing_arxiv_id":"2605.10933","citing_title":"DECO: Sparse Mixture-of-Experts with Dense-Comparable Performance on End-Side Devices","ref_index":66,"is_internal_anchor":false},{"citing_arxiv_id":"2605.10933","citing_title":"DECO: Sparse Mixture-of-Experts with Dense-Comparable Performance on End-Side Devices","ref_index":66,"is_internal_anchor":false},{"citing_arxiv_id":"2604.25903","citing_title":"Carbon-Taxed Transformers: A Green Compression Pipeline for Overgrown Language Models","ref_index":59,"is_internal_anchor":false},{"citing_arxiv_id":"1910.01108","citing_title":"DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter","ref_index":43,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/XRGIFYPBHSS6UPYMDKO5FHVVF4","json":"https://pith.science/pith/XRGIFYPBHSS6UPYMDKO5FHVVF4.json","graph_json":"https://pith.science/api/pith-number/XRGIFYPBHSS6UPYMDKO5FHVVF4/graph.json","events_json":"https://pith.science/api/pith-number/XRGIFYPBHSS6UPYMDKO5FHVVF4/events.json","paper":"https://pith.science/paper/XRGIFYPB"},"agent_actions":{"view_html":"https://pith.science/pith/XRGIFYPBHSS6UPYMDKO5FHVVF4","download_json":"https://pith.science/pith/XRGIFYPBHSS6UPYMDKO5FHVVF4.json","view_paper":"https://pith.science/paper/XRGIFYPB","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1903.12136&json=true","fetch_graph":"https://pith.science/api/pith-number/XRGIFYPBHSS6UPYMDKO5FHVVF4/graph.json","fetch_events":"https://pith.science/api/pith-number/XRGIFYPBHSS6UPYMDKO5FHVVF4/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/XRGIFYPBHSS6UPYMDKO5FHVVF4/action/timestamp_anchor","attest_storage":"https://pith.science/pith/XRGIFYPBHSS6UPYMDKO5FHVVF4/action/storage_attestation","attest_author":"https://pith.science/pith/XRGIFYPBHSS6UPYMDKO5FHVVF4/action/author_attestation","sign_citation":"https://pith.science/pith/XRGIFYPBHSS6UPYMDKO5FHVVF4/action/citation_signature","submit_replication":"https://pith.science/pith/XRGIFYPBHSS6UPYMDKO5FHVVF4/action/replication_record"}},"created_at":"2026-05-17T23:49:57.944011+00:00","updated_at":"2026-05-17T23:49:57.944011+00:00"}