{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2021:C5W5P6KVGEHNQOJOI7SI2L27Z2","short_pith_number":"pith:C5W5P6KV","canonical_record":{"source":{"id":"2111.09543","kind":"arxiv","version":4},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2021-11-18T06:48:00Z","cross_cats_sorted":["cs.LG"],"title_canon_sha256":"bbfea01373de9b0c2623a49541e84f761d6cab8f9bc1eb766acce36e5188d917","abstract_canon_sha256":"f35e93f6ffbc86590f62ab257c29e8323fbfd90db0896f8a2e22f75307f307a9"},"schema_version":"1.0"},"canonical_sha256":"176dd7f955310ed8392e47e48d2f5fceb0214103eb77daf9ae84779d57b047bb","source":{"kind":"arxiv","id":"2111.09543","version":4},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2111.09543","created_at":"2026-05-17T23:38:52Z"},{"alias_kind":"arxiv_version","alias_value":"2111.09543v4","created_at":"2026-05-17T23:38:52Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2111.09543","created_at":"2026-05-17T23:38:52Z"},{"alias_kind":"pith_short_12","alias_value":"C5W5P6KVGEHN","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_16","alias_value":"C5W5P6KVGEHNQOJO","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_8","alias_value":"C5W5P6KV","created_at":"2026-05-18T12:33:33Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2021:C5W5P6KVGEHNQOJOI7SI2L27Z2","target":"record","payload":{"canonical_record":{"source":{"id":"2111.09543","kind":"arxiv","version":4},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2021-11-18T06:48:00Z","cross_cats_sorted":["cs.LG"],"title_canon_sha256":"bbfea01373de9b0c2623a49541e84f761d6cab8f9bc1eb766acce36e5188d917","abstract_canon_sha256":"f35e93f6ffbc86590f62ab257c29e8323fbfd90db0896f8a2e22f75307f307a9"},"schema_version":"1.0"},"canonical_sha256":"176dd7f955310ed8392e47e48d2f5fceb0214103eb77daf9ae84779d57b047bb","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:52.519522Z","signature_b64":"NvP0XxtDFkUQcBwuakGHvwkccM4Yb0zWxneIahS3EthmM3lRVJpVlUi5QcfdJ91K6dQexXfipyx0AGCSDeydCg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"176dd7f955310ed8392e47e48d2f5fceb0214103eb77daf9ae84779d57b047bb","last_reissued_at":"2026-05-17T23:38:52.519089Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:52.519089Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2111.09543","source_version":4,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:52Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"DPxzcyLYH9T42py29DphNNYyozk3dHuaLMX0WCaSRUQlJAyXQ6M0icNIHOT37tINIthbAZikp79rGZj3BazJCg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-10T10:32:46.366078Z"},"content_sha256":"b3e76000cc6801c1712998907d8696c0a6273eb9cde2bdd1888dea27d2d3cb30","schema_version":"1.0","event_id":"sha256:b3e76000cc6801c1712998907d8696c0a6273eb9cde2bdd1888dea27d2d3cb30"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2021:C5W5P6KVGEHNQOJOI7SI2L27Z2","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"DeBERTaV3: Improving DeBERTa using ELECTRA-Style Pre-Training with Gradient-Disentangled Embedding Sharing","license":"http://creativecommons.org/licenses/by/4.0/","headline":"DeBERTaV3 replaces masked language modeling with replaced token detection and introduces gradient-disentangled embedding sharing to raise accuracy on natural language understanding benchmarks.","cross_cats":["cs.LG"],"primary_cat":"cs.CL","authors_text":"Jianfeng Gao, Pengcheng He, Weizhu Chen","submitted_at":"2021-11-18T06:48:00Z","abstract_excerpt":"This paper presents a new pre-trained language model, DeBERTaV3, which improves the original DeBERTa model by replacing mask language modeling (MLM) with replaced token detection (RTD), a more sample-efficient pre-training task. Our analysis shows that vanilla embedding sharing in ELECTRA hurts training efficiency and model performance. This is because the training losses of the discriminator and the generator pull token embeddings in different directions, creating the \"tug-of-war\" dynamics. We thus propose a new gradient-disentangled embedding sharing method that avoids the tug-of-war dynamic"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"the DeBERTaV3 Large model achieves a 91.37% average score, which is 1.37% over DeBERTa and 1.91% over ELECTRA, setting a new state-of-the-art (SOTA) among the models with a similar structure.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the observed gains come from the gradient-disentangled sharing rather than from other unstated differences in training schedule, data order, or hyper-parameters between the new runs and the cited DeBERTa/ELECTRA baselines.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"DeBERTaV3 improves DeBERTa by switching to replaced token detection pre-training and using gradient-disentangled embedding sharing, reaching 91.37% on GLUE and new SOTA on XNLI zero-shot.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"DeBERTaV3 replaces masked language modeling with replaced token detection and introduces gradient-disentangled embedding sharing to raise accuracy on natural language understanding benchmarks.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"0a8765f68352b0095408459e810ab2b5ffc2c46cb226296ff4c7027976b0144e"},"source":{"id":"2111.09543","kind":"arxiv","version":4},"verdict":{"id":"16f8b19b-50a4-4a5e-9eab-860559247bc0","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T12:43:34.120976Z","strongest_claim":"the DeBERTaV3 Large model achieves a 91.37% average score, which is 1.37% over DeBERTa and 1.91% over ELECTRA, setting a new state-of-the-art (SOTA) among the models with a similar structure.","one_line_summary":"DeBERTaV3 improves DeBERTa by switching to replaced token detection pre-training and using gradient-disentangled embedding sharing, reaching 91.37% on GLUE and new SOTA on XNLI zero-shot.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the observed gains come from the gradient-disentangled sharing rather than from other unstated differences in training schedule, data order, or hyper-parameters between the new runs and the cited DeBERTa/ELECTRA baselines.","pith_extraction_headline":"DeBERTaV3 replaces masked language modeling with replaced token detection and introduces gradient-disentangled embedding sharing to raise accuracy on natural language understanding benchmarks."},"references":{"count":29,"sample":[{"doi":"","year":2005,"title":"Language Models are Few-Shot Learners","work_id":"214732c0-2edd-44a0-af9e-28184a2b8279","ref_index":1,"cited_arxiv_id":"2005.14165","is_internal_anchor":true},{"doi":"","year":2017,"title":"Semeval-2017 task 1: Semantic textual similarity-multilingual and cross-lingual focused evaluation","work_id":"b884e540-64cb-4ba2-8bc3-ed507116ef2c","ref_index":2,"cited_arxiv_id":"1708.00055","is_internal_anchor":true},{"doi":"","year":null,"title":"Xlm-e: Cross-lingual language model pre-training via electra","work_id":"83df32e3-23a3-45d6-9309-83e4754d6e56","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2018,"title":"Xnli: Evaluating cross-lingual sentence representations","work_id":"86645862-6fcc-432e-9362-8e78c10b2759","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2019,"title":"Bert: Pre-training of deep bidirectional transformers for language understanding","work_id":"693b70ad-3022-4615-938e-7752341ec181","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":29,"snapshot_sha256":"172f439b9a71b3472d6222cb44401148cebbf1f3bd095dc20ffb75f65564a805","internal_anchors":8},"formal_canon":{"evidence_count":2,"snapshot_sha256":"a8e9a39262aa431ab0f61a93a5641b8064b9fae81cfb5ed3bada3b28c0de223e"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"16f8b19b-50a4-4a5e-9eab-860559247bc0"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:52Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"rEa89+6aqv0G1PdMirt1e00gBHF3oR5gQZjPjr7Lb/+N0olw3te9J7MvUQTu/cBXNqGZJgCaiZJYf77krYIMDw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-10T10:32:46.366652Z"},"content_sha256":"eb6ce148f14d31853326b3c6f3e534fbcab0ec0d21d72db466efbb9b909a660a","schema_version":"1.0","event_id":"sha256:eb6ce148f14d31853326b3c6f3e534fbcab0ec0d21d72db466efbb9b909a660a"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/C5W5P6KVGEHNQOJOI7SI2L27Z2/bundle.json","state_url":"https://pith.science/pith/C5W5P6KVGEHNQOJOI7SI2L27Z2/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/C5W5P6KVGEHNQOJOI7SI2L27Z2/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-10T10:32:46Z","links":{"resolver":"https://pith.science/pith/C5W5P6KVGEHNQOJOI7SI2L27Z2","bundle":"https://pith.science/pith/C5W5P6KVGEHNQOJOI7SI2L27Z2/bundle.json","state":"https://pith.science/pith/C5W5P6KVGEHNQOJOI7SI2L27Z2/state.json","well_known_bundle":"https://pith.science/.well-known/pith/C5W5P6KVGEHNQOJOI7SI2L27Z2/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2021:C5W5P6KVGEHNQOJOI7SI2L27Z2","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"f35e93f6ffbc86590f62ab257c29e8323fbfd90db0896f8a2e22f75307f307a9","cross_cats_sorted":["cs.LG"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2021-11-18T06:48:00Z","title_canon_sha256":"bbfea01373de9b0c2623a49541e84f761d6cab8f9bc1eb766acce36e5188d917"},"schema_version":"1.0","source":{"id":"2111.09543","kind":"arxiv","version":4}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2111.09543","created_at":"2026-05-17T23:38:52Z"},{"alias_kind":"arxiv_version","alias_value":"2111.09543v4","created_at":"2026-05-17T23:38:52Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2111.09543","created_at":"2026-05-17T23:38:52Z"},{"alias_kind":"pith_short_12","alias_value":"C5W5P6KVGEHN","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_16","alias_value":"C5W5P6KVGEHNQOJO","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_8","alias_value":"C5W5P6KV","created_at":"2026-05-18T12:33:33Z"}],"graph_snapshots":[{"event_id":"sha256:eb6ce148f14d31853326b3c6f3e534fbcab0ec0d21d72db466efbb9b909a660a","target":"graph","created_at":"2026-05-17T23:38:52Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"the DeBERTaV3 Large model achieves a 91.37% average score, which is 1.37% over DeBERTa and 1.91% over ELECTRA, setting a new state-of-the-art (SOTA) among the models with a similar structure."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the observed gains come from the gradient-disentangled sharing rather than from other unstated differences in training schedule, data order, or hyper-parameters between the new runs and the cited DeBERTa/ELECTRA baselines."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"DeBERTaV3 improves DeBERTa by switching to replaced token detection pre-training and using gradient-disentangled embedding sharing, reaching 91.37% on GLUE and new SOTA on XNLI zero-shot."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"DeBERTaV3 replaces masked language modeling with replaced token detection and introduces gradient-disentangled embedding sharing to raise accuracy on natural language understanding benchmarks."}],"snapshot_sha256":"0a8765f68352b0095408459e810ab2b5ffc2c46cb226296ff4c7027976b0144e"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"a8e9a39262aa431ab0f61a93a5641b8064b9fae81cfb5ed3bada3b28c0de223e"},"paper":{"abstract_excerpt":"This paper presents a new pre-trained language model, DeBERTaV3, which improves the original DeBERTa model by replacing mask language modeling (MLM) with replaced token detection (RTD), a more sample-efficient pre-training task. Our analysis shows that vanilla embedding sharing in ELECTRA hurts training efficiency and model performance. This is because the training losses of the discriminator and the generator pull token embeddings in different directions, creating the \"tug-of-war\" dynamics. We thus propose a new gradient-disentangled embedding sharing method that avoids the tug-of-war dynamic","authors_text":"Jianfeng Gao, Pengcheng He, Weizhu Chen","cross_cats":["cs.LG"],"headline":"DeBERTaV3 replaces masked language modeling with replaced token detection and introduces gradient-disentangled embedding sharing to raise accuracy on natural language understanding benchmarks.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2021-11-18T06:48:00Z","title":"DeBERTaV3: Improving DeBERTa using ELECTRA-Style Pre-Training with Gradient-Disentangled Embedding Sharing"},"references":{"count":29,"internal_anchors":8,"resolved_work":29,"sample":[{"cited_arxiv_id":"2005.14165","doi":"","is_internal_anchor":true,"ref_index":1,"title":"Language Models are Few-Shot Learners","work_id":"214732c0-2edd-44a0-af9e-28184a2b8279","year":2005},{"cited_arxiv_id":"1708.00055","doi":"","is_internal_anchor":true,"ref_index":2,"title":"Semeval-2017 task 1: Semantic textual similarity-multilingual and cross-lingual focused evaluation","work_id":"b884e540-64cb-4ba2-8bc3-ed507116ef2c","year":2017},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Xlm-e: Cross-lingual language model pre-training via electra","work_id":"83df32e3-23a3-45d6-9309-83e4754d6e56","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Xnli: Evaluating cross-lingual sentence representations","work_id":"86645862-6fcc-432e-9362-8e78c10b2759","year":2018},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Bert: Pre-training of deep bidirectional transformers for language understanding","work_id":"693b70ad-3022-4615-938e-7752341ec181","year":2019}],"snapshot_sha256":"172f439b9a71b3472d6222cb44401148cebbf1f3bd095dc20ffb75f65564a805"},"source":{"id":"2111.09543","kind":"arxiv","version":4},"verdict":{"created_at":"2026-05-15T12:43:34.120976Z","id":"16f8b19b-50a4-4a5e-9eab-860559247bc0","model_set":{"reader":"grok-4.3"},"one_line_summary":"DeBERTaV3 improves DeBERTa by switching to replaced token detection pre-training and using gradient-disentangled embedding sharing, reaching 91.37% on GLUE and new SOTA on XNLI zero-shot.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"DeBERTaV3 replaces masked language modeling with replaced token detection and introduces gradient-disentangled embedding sharing to raise accuracy on natural language understanding benchmarks.","strongest_claim":"the DeBERTaV3 Large model achieves a 91.37% average score, which is 1.37% over DeBERTa and 1.91% over ELECTRA, setting a new state-of-the-art (SOTA) among the models with a similar structure.","weakest_assumption":"That the observed gains come from the gradient-disentangled sharing rather than from other unstated differences in training schedule, data order, or hyper-parameters between the new runs and the cited DeBERTa/ELECTRA baselines."}},"verdict_id":"16f8b19b-50a4-4a5e-9eab-860559247bc0"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:b3e76000cc6801c1712998907d8696c0a6273eb9cde2bdd1888dea27d2d3cb30","target":"record","created_at":"2026-05-17T23:38:52Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"f35e93f6ffbc86590f62ab257c29e8323fbfd90db0896f8a2e22f75307f307a9","cross_cats_sorted":["cs.LG"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2021-11-18T06:48:00Z","title_canon_sha256":"bbfea01373de9b0c2623a49541e84f761d6cab8f9bc1eb766acce36e5188d917"},"schema_version":"1.0","source":{"id":"2111.09543","kind":"arxiv","version":4}},"canonical_sha256":"176dd7f955310ed8392e47e48d2f5fceb0214103eb77daf9ae84779d57b047bb","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"176dd7f955310ed8392e47e48d2f5fceb0214103eb77daf9ae84779d57b047bb","first_computed_at":"2026-05-17T23:38:52.519089Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:52.519089Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"NvP0XxtDFkUQcBwuakGHvwkccM4Yb0zWxneIahS3EthmM3lRVJpVlUi5QcfdJ91K6dQexXfipyx0AGCSDeydCg==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:52.519522Z","signed_message":"canonical_sha256_bytes"},"source_id":"2111.09543","source_kind":"arxiv","source_version":4}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:b3e76000cc6801c1712998907d8696c0a6273eb9cde2bdd1888dea27d2d3cb30","sha256:eb6ce148f14d31853326b3c6f3e534fbcab0ec0d21d72db466efbb9b909a660a"],"state_sha256":"07f93fa267330b7c7f63ca201d50b11c8f43a223d4ba471b9886c1fdb7c47fa3"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"p1kua4AnuA75jz378i4hrM/UJyIf08eZBdm/qe4nzcvuVLZPQMU6YdDBXEaf7abI4H6Tyc7+ksbbLr/R5lkPBQ==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-10T10:32:46.370067Z","bundle_sha256":"617c05350250710811080f98797939c5efa22129f9685309128e6a7c16874097"}}