{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2023:XVLHF5BO3STBCUQUTM5MEMU5FA","short_pith_number":"pith:XVLHF5BO","schema_version":"1.0","canonical_sha256":"bd5672f42edca61152149b3ac2329d2837827f107f28cdca696924c265e9e803","source":{"kind":"arxiv","id":"2310.11453","version":1},"attestation_state":"computed","paper":{"title":"BitNet: Scaling 1-bit Transformers for Large Language Models","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Fan Yang, Furu Wei, Hongyu Wang, Huaijie Wang, Li Dong, Lingxiao Ma, Ruiping Wang, Shaohan Huang, Shuming Ma, Yi Wu","submitted_at":"2023-10-17T17:59:15Z","abstract_excerpt":"The increasing size of large language models has posed challenges for deployment and raised concerns about environmental impact due to high energy consumption. In this work, we introduce BitNet, a scalable and stable 1-bit Transformer architecture designed for large language models. Specifically, we introduce BitLinear as a drop-in replacement of the nn.Linear layer in order to train 1-bit weights from scratch. Experimental results on language modeling show that BitNet achieves competitive performance while substantially reducing memory footprint and energy consumption, compared to state-of-th"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2310.11453","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2023-10-17T17:59:15Z","cross_cats_sorted":[],"title_canon_sha256":"e5e311727e3b305fe089fa7d47a42678623c85f86304742d628bd1dce40527df","abstract_canon_sha256":"3d98f38b3935873101f9c6262852ebccb0ebd1cf96e18061bb3bd2c8e7e60a20"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:13.364880Z","signature_b64":"MnYE5O3jESDJfH54EWz8IvdxNHozguKk1FoB3Qag46I1UY1sTnJRWpIzLKfKy9+C2OsOlJbMffLUZptyyLEzBg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"bd5672f42edca61152149b3ac2329d2837827f107f28cdca696924c265e9e803","last_reissued_at":"2026-05-17T23:38:13.363666Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:13.363666Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"BitNet: Scaling 1-bit Transformers for Large Language Models","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Fan Yang, Furu Wei, Hongyu Wang, Huaijie Wang, Li Dong, Lingxiao Ma, Ruiping Wang, Shaohan Huang, Shuming Ma, Yi Wu","submitted_at":"2023-10-17T17:59:15Z","abstract_excerpt":"The increasing size of large language models has posed challenges for deployment and raised concerns about environmental impact due to high energy consumption. In this work, we introduce BitNet, a scalable and stable 1-bit Transformer architecture designed for large language models. Specifically, we introduce BitLinear as a drop-in replacement of the nn.Linear layer in order to train 1-bit weights from scratch. Experimental results on language modeling show that BitNet achieves competitive performance while substantially reducing memory footprint and energy consumption, compared to state-of-th"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2310.11453","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2310.11453","created_at":"2026-05-17T23:38:13.363772+00:00"},{"alias_kind":"arxiv_version","alias_value":"2310.11453v1","created_at":"2026-05-17T23:38:13.363772+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2310.11453","created_at":"2026-05-17T23:38:13.363772+00:00"},{"alias_kind":"pith_short_12","alias_value":"XVLHF5BO3STB","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_16","alias_value":"XVLHF5BO3STBCUQU","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_8","alias_value":"XVLHF5BO","created_at":"2026-05-18T12:33:37.589309+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":17,"internal_anchor_count":1,"sample":[{"citing_arxiv_id":"2402.17764","citing_title":"The Era of 1-bit LLMs: All Large Language Models are in 1.58 Bits","ref_index":14,"is_internal_anchor":true},{"citing_arxiv_id":"2512.06443","citing_title":"Vec-LUT: Vector Table Lookup for Parallel Ultra-Low-Bit LLM Inference on Edge Devices","ref_index":41,"is_internal_anchor":false},{"citing_arxiv_id":"2512.21651","citing_title":"Rethinking Output Alignment For 1-bit Post-Training Quantization of Large Language Models","ref_index":16,"is_internal_anchor":false},{"citing_arxiv_id":"2602.05243","citing_title":"CORP: Closed-Form One-shot Representation-Preserving Structured Pruning for Transformers","ref_index":21,"is_internal_anchor":false},{"citing_arxiv_id":"2602.06252","citing_title":"D-Legion: A Scalable Many-Core Architecture for Accelerating Matrix Multiplication in Quantized LLMs","ref_index":5,"is_internal_anchor":false},{"citing_arxiv_id":"2603.18104","citing_title":"Adaptive Domain Models: Bayesian Evolution, Warm Rotation, and Principled Training for Geometric and Neuromorphic AI","ref_index":27,"is_internal_anchor":false},{"citing_arxiv_id":"2605.13859","citing_title":"BiSpikCLM: A Spiking Language Model integrating Softmax-Free Spiking Attention and Spike-Aware Alignment Distillation","ref_index":16,"is_internal_anchor":false},{"citing_arxiv_id":"2604.03957","citing_title":"BWTA: Accurate and Efficient Binarized Transformer by Algorithm-Hardware Co-design","ref_index":27,"is_internal_anchor":false},{"citing_arxiv_id":"2605.11558","citing_title":"A Composite Activation Function for Learning Stable Binary Representations","ref_index":70,"is_internal_anchor":false},{"citing_arxiv_id":"2605.09425","citing_title":"AtteConDA: Attention-Based Conflict Suppression in Multi-Condition Diffusion Models and Synthetic Data Augmentation","ref_index":81,"is_internal_anchor":false},{"citing_arxiv_id":"2605.08755","citing_title":"LAQuant: A Simple Overhead-free Large Reasoning Model Quantization by Layer-wise Lookahead Loss","ref_index":50,"is_internal_anchor":false},{"citing_arxiv_id":"2604.24273","citing_title":"BitRL: Reinforcement Learning with 1-bit Quantized Language Models for Resource-Constrained Edge Deployment","ref_index":19,"is_internal_anchor":false},{"citing_arxiv_id":"2605.06485","citing_title":"Litespark Inference on Consumer CPUs: Custom SIMD Kernels for Ternary Neural Networks","ref_index":5,"is_internal_anchor":false},{"citing_arxiv_id":"2604.19167","citing_title":"LBLLM: Lightweight Binarization of Large Language Models via Three-Stage Distillation","ref_index":21,"is_internal_anchor":false},{"citing_arxiv_id":"2604.10091","citing_title":"SEPTQ: A Simple and Effective Post-Training Quantization Paradigm for Large Language Models","ref_index":39,"is_internal_anchor":false},{"citing_arxiv_id":"2604.06836","citing_title":"STQuant: Spatio-Temporal Adaptive Framework for Optimizer Quantization in Large Multimodal Model Training","ref_index":26,"is_internal_anchor":false},{"citing_arxiv_id":"2604.18556","citing_title":"GSQ: Highly-Accurate Low-Precision Scalar Quantization for LLMs via Gumbel-Softmax Sampling","ref_index":32,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/XVLHF5BO3STBCUQUTM5MEMU5FA","json":"https://pith.science/pith/XVLHF5BO3STBCUQUTM5MEMU5FA.json","graph_json":"https://pith.science/api/pith-number/XVLHF5BO3STBCUQUTM5MEMU5FA/graph.json","events_json":"https://pith.science/api/pith-number/XVLHF5BO3STBCUQUTM5MEMU5FA/events.json","paper":"https://pith.science/paper/XVLHF5BO"},"agent_actions":{"view_html":"https://pith.science/pith/XVLHF5BO3STBCUQUTM5MEMU5FA","download_json":"https://pith.science/pith/XVLHF5BO3STBCUQUTM5MEMU5FA.json","view_paper":"https://pith.science/paper/XVLHF5BO","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2310.11453&json=true","fetch_graph":"https://pith.science/api/pith-number/XVLHF5BO3STBCUQUTM5MEMU5FA/graph.json","fetch_events":"https://pith.science/api/pith-number/XVLHF5BO3STBCUQUTM5MEMU5FA/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/XVLHF5BO3STBCUQUTM5MEMU5FA/action/timestamp_anchor","attest_storage":"https://pith.science/pith/XVLHF5BO3STBCUQUTM5MEMU5FA/action/storage_attestation","attest_author":"https://pith.science/pith/XVLHF5BO3STBCUQUTM5MEMU5FA/action/author_attestation","sign_citation":"https://pith.science/pith/XVLHF5BO3STBCUQUTM5MEMU5FA/action/citation_signature","submit_replication":"https://pith.science/pith/XVLHF5BO3STBCUQUTM5MEMU5FA/action/replication_record"}},"created_at":"2026-05-17T23:38:13.363772+00:00","updated_at":"2026-05-17T23:38:13.363772+00:00"}