{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2024:3S5Y6A5LN45KO7NVCEOOCLNFD5","short_pith_number":"pith:3S5Y6A5L","schema_version":"1.0","canonical_sha256":"dcbb8f03ab6f3aa77db5111ce12da51f41e7a8ec3feae4c5ed1024eb4f847e9c","source":{"kind":"arxiv","id":"2412.13663","version":2},"attestation_state":"computed","paper":{"title":"Smarter, Better, Faster, Longer: A Modern Bidirectional Encoder for Fast, Memory Efficient, and Long Context Finetuning and Inference","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.CL","authors_text":"Alexis Gallagher, Antoine Chaffin, Benjamin Clavi\\'e, Benjamin Warner, Faisal Ladhak, Griffin Adams, Iacopo Poli, Jeremy Howard, Nathan Cooper, Orion Weller, Oskar Hallstr\\\"om, Raja Biswas, Said Taghadouini, Tom Aarsen","submitted_at":"2024-12-18T09:39:44Z","abstract_excerpt":"Encoder-only transformer models such as BERT offer a great performance-size tradeoff for retrieval and classification tasks with respect to larger decoder-only models. Despite being the workhorse of numerous production pipelines, there have been limited Pareto improvements to BERT since its release. In this paper, we introduce ModernBERT, bringing modern model optimizations to encoder-only models and representing a major Pareto improvement over older encoders. Trained on 2 trillion tokens with a native 8192 sequence length, ModernBERT models exhibit state-of-the-art results on a large pool of "},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2412.13663","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2024-12-18T09:39:44Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"a35577cf6c1da11279590635ca749acdbaeee9b9f33fa12288491d2cb7ea194b","abstract_canon_sha256":"121af0e3691b4c802a9f1fdbe702c7a47889b22303e2daa21ed745bb5974bf2e"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T17:34:42.676731Z","signature_b64":"m65RI2gAeknckADA/76sCIcOfGnbkS11yaf0x+EjQ1On1XdV9x6Jz9ehfoSz0Me6t8LI5kiiP1qf4f6uSHMADQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"dcbb8f03ab6f3aa77db5111ce12da51f41e7a8ec3feae4c5ed1024eb4f847e9c","last_reissued_at":"2026-05-20T17:34:42.673889Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T17:34:42.673889Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Smarter, Better, Faster, Longer: A Modern Bidirectional Encoder for Fast, Memory Efficient, and Long Context Finetuning and Inference","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.CL","authors_text":"Alexis Gallagher, Antoine Chaffin, Benjamin Clavi\\'e, Benjamin Warner, Faisal Ladhak, Griffin Adams, Iacopo Poli, Jeremy Howard, Nathan Cooper, Orion Weller, Oskar Hallstr\\\"om, Raja Biswas, Said Taghadouini, Tom Aarsen","submitted_at":"2024-12-18T09:39:44Z","abstract_excerpt":"Encoder-only transformer models such as BERT offer a great performance-size tradeoff for retrieval and classification tasks with respect to larger decoder-only models. Despite being the workhorse of numerous production pipelines, there have been limited Pareto improvements to BERT since its release. In this paper, we introduce ModernBERT, bringing modern model optimizations to encoder-only models and representing a major Pareto improvement over older encoders. Trained on 2 trillion tokens with a native 8192 sequence length, ModernBERT models exhibit state-of-the-art results on a large pool of "},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2412.13663","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2412.13663/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2412.13663","created_at":"2026-05-20T17:34:42.674028+00:00"},{"alias_kind":"arxiv_version","alias_value":"2412.13663v2","created_at":"2026-05-20T17:34:42.674028+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2412.13663","created_at":"2026-05-20T17:34:42.674028+00:00"},{"alias_kind":"pith_short_12","alias_value":"3S5Y6A5LN45K","created_at":"2026-05-20T17:34:42.674028+00:00"},{"alias_kind":"pith_short_16","alias_value":"3S5Y6A5LN45KO7NV","created_at":"2026-05-20T17:34:42.674028+00:00"},{"alias_kind":"pith_short_8","alias_value":"3S5Y6A5L","created_at":"2026-05-20T17:34:42.674028+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":22,"internal_anchor_count":22,"sample":[{"citing_arxiv_id":"2605.20713","citing_title":"SAVER: Selective As-Needed Vision Evidence for Multimodal Information Extraction","ref_index":21,"is_internal_anchor":true},{"citing_arxiv_id":"2605.16035","citing_title":"Who Owns This Agent? Tracing AI Agents Back to Their Owners","ref_index":35,"is_internal_anchor":true},{"citing_arxiv_id":"2605.17106","citing_title":"HyDRA: Hybrid Dynamic Routing Architecture for Heterogeneous LLM Pools","ref_index":17,"is_internal_anchor":true},{"citing_arxiv_id":"2605.17201","citing_title":"Filter-then-Verify: A Multiphase GNN and ModernBERT Framework for Social Engineering Detection in Email Networks","ref_index":15,"is_internal_anchor":true},{"citing_arxiv_id":"2605.19568","citing_title":"m3BERT: A Modern, Multi-lingual, Matryoshka Bidirectional Encoder","ref_index":44,"is_internal_anchor":true},{"citing_arxiv_id":"2605.16991","citing_title":"Response-free item difficulty modelling for multiple-choice items with fine-tuned transformers: Component-wise representation and multi-task learning","ref_index":182,"is_internal_anchor":true},{"citing_arxiv_id":"2505.20414","citing_title":"RetroMotion: Retrocausal Motion Forecasting Models are Instructable","ref_index":51,"is_internal_anchor":true},{"citing_arxiv_id":"2507.00994","citing_title":"Should We Still Pretrain Encoders with Masked Language Modeling?","ref_index":43,"is_internal_anchor":true},{"citing_arxiv_id":"2507.20993","citing_title":"Annotation-Assisted Learning of Treatment Policies From Multimodal Electronic Health Records","ref_index":42,"is_internal_anchor":true},{"citing_arxiv_id":"2509.00798","citing_title":"Progressive Multimodal Search and Reasoning for Knowledge-Intensive Visual Question Answering","ref_index":45,"is_internal_anchor":true},{"citing_arxiv_id":"2512.11108","citing_title":"Explanation Bias is a Product: Revealing the Hidden Lexical and Position Preferences in Post-Hoc Feature Attribution","ref_index":5,"is_internal_anchor":true},{"citing_arxiv_id":"2604.06193","citing_title":"Depression Detection at the Point of Care: Automated Analysis of Linguistic Signals from Routine Primary Care Encounters","ref_index":33,"is_internal_anchor":true},{"citing_arxiv_id":"2604.26483","citing_title":"Efficient Listwise Reranking with Compressed Document Representations","ref_index":34,"is_internal_anchor":true},{"citing_arxiv_id":"2605.08254","citing_title":"HyperTransport: Amortized Conditioning of T2I Generative Models","ref_index":10,"is_internal_anchor":true},{"citing_arxiv_id":"2604.23488","citing_title":"Do Synthetic Trajectories Reflect Real Reward Hacking? A Systematic Study on Monitoring In-the-Wild Hacking in Code Generation","ref_index":12,"is_internal_anchor":true},{"citing_arxiv_id":"2605.00086","citing_title":"NorBERTo: A ModernBERT Model Trained for Portuguese with 331 Billion Tokens Corpus","ref_index":4,"is_internal_anchor":true},{"citing_arxiv_id":"2604.19921","citing_title":"Commonsense Knowledge with Negation: A Resource to Enhance Negation Understanding","ref_index":7,"is_internal_anchor":true},{"citing_arxiv_id":"2604.07985","citing_title":"Rag Performance Prediction for Question Answering","ref_index":41,"is_internal_anchor":true},{"citing_arxiv_id":"2605.07622","citing_title":"Is She Even Relevant? When BERT Ignores Explicit Gender Cues","ref_index":7,"is_internal_anchor":true},{"citing_arxiv_id":"2605.07982","citing_title":"GLiGuard: Schema-Conditioned Classification for LLM Safeguard","ref_index":17,"is_internal_anchor":true},{"citing_arxiv_id":"2605.07554","citing_title":"ProteinJEPA: Latent prediction complements protein language models","ref_index":18,"is_internal_anchor":true},{"citing_arxiv_id":"2604.18603","citing_title":"Dual Triangle Attention: Effective Bidirectional Attention Without Positional Embeddings","ref_index":5,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/3S5Y6A5LN45KO7NVCEOOCLNFD5","json":"https://pith.science/pith/3S5Y6A5LN45KO7NVCEOOCLNFD5.json","graph_json":"https://pith.science/api/pith-number/3S5Y6A5LN45KO7NVCEOOCLNFD5/graph.json","events_json":"https://pith.science/api/pith-number/3S5Y6A5LN45KO7NVCEOOCLNFD5/events.json","paper":"https://pith.science/paper/3S5Y6A5L"},"agent_actions":{"view_html":"https://pith.science/pith/3S5Y6A5LN45KO7NVCEOOCLNFD5","download_json":"https://pith.science/pith/3S5Y6A5LN45KO7NVCEOOCLNFD5.json","view_paper":"https://pith.science/paper/3S5Y6A5L","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2412.13663&json=true","fetch_graph":"https://pith.science/api/pith-number/3S5Y6A5LN45KO7NVCEOOCLNFD5/graph.json","fetch_events":"https://pith.science/api/pith-number/3S5Y6A5LN45KO7NVCEOOCLNFD5/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/3S5Y6A5LN45KO7NVCEOOCLNFD5/action/timestamp_anchor","attest_storage":"https://pith.science/pith/3S5Y6A5LN45KO7NVCEOOCLNFD5/action/storage_attestation","attest_author":"https://pith.science/pith/3S5Y6A5LN45KO7NVCEOOCLNFD5/action/author_attestation","sign_citation":"https://pith.science/pith/3S5Y6A5LN45KO7NVCEOOCLNFD5/action/citation_signature","submit_replication":"https://pith.science/pith/3S5Y6A5LN45KO7NVCEOOCLNFD5/action/replication_record"}},"created_at":"2026-05-20T17:34:42.674028+00:00","updated_at":"2026-05-20T17:34:42.674028+00:00"}