{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2022:6CCDXNMFOZ575PMU72ZXHZKRJJ","short_pith_number":"pith:6CCDXNMF","schema_version":"1.0","canonical_sha256":"f0843bb585767bfebd94feb373e5514a732461826e7ef46c7de447bdeece5cfe","source":{"kind":"arxiv","id":"2202.08625","version":1},"attestation_state":"computed","paper":{"title":"Revisiting Over-smoothing in BERT from the Perspective of Graph","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.LG","authors_text":"Hang Xu, Han Shi, James T. Kwok, Jiahui Gao, Lingpeng Kong, Stephen M.S. Lee, Xiaodan Liang, Zhenguo Li","submitted_at":"2022-02-17T12:20:52Z","abstract_excerpt":"Recently over-smoothing phenomenon of Transformer-based models is observed in both vision and language fields. However, no existing work has delved deeper to further investigate the main cause of this phenomenon. In this work, we make the attempt to analyze the over-smoothing problem from the perspective of graph, where such problem was first discovered and explored. Intuitively, the self-attention matrix can be seen as a normalized adjacent matrix of a corresponding graph. Based on the above connection, we provide some theoretical analysis and find that layer normalization plays a key role in"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2202.08625","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2022-02-17T12:20:52Z","cross_cats_sorted":[],"title_canon_sha256":"d5758defa95a52c214bd3660a81ea0e7391854e1ff4bd196d59957a3cbb9a83e","abstract_canon_sha256":"34dbf26e86e8ee3e17bb0e0ba9b349ffd1184006a4dbedb41123ebd98d60483d"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-07-05T03:57:56.278651Z","signature_b64":"0wSYxGQM9Mho/ASZr7rMfFekrdvDBBJor94FwqDNN3IPqhT5vhIHlcMDjTqImWj4uyPThcqmZRZTVjakw5/YDQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"f0843bb585767bfebd94feb373e5514a732461826e7ef46c7de447bdeece5cfe","last_reissued_at":"2026-07-05T03:57:56.278161Z","signature_status":"signed_v1","first_computed_at":"2026-07-05T03:57:56.278161Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Revisiting Over-smoothing in BERT from the Perspective of Graph","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.LG","authors_text":"Hang Xu, Han Shi, James T. Kwok, Jiahui Gao, Lingpeng Kong, Stephen M.S. Lee, Xiaodan Liang, Zhenguo Li","submitted_at":"2022-02-17T12:20:52Z","abstract_excerpt":"Recently over-smoothing phenomenon of Transformer-based models is observed in both vision and language fields. However, no existing work has delved deeper to further investigate the main cause of this phenomenon. In this work, we make the attempt to analyze the over-smoothing problem from the perspective of graph, where such problem was first discovered and explored. Intuitively, the self-attention matrix can be seen as a normalized adjacent matrix of a corresponding graph. Based on the above connection, we provide some theoretical analysis and find that layer normalization plays a key role in"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2202.08625","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2202.08625/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2202.08625","created_at":"2026-07-05T03:57:56.278221+00:00"},{"alias_kind":"arxiv_version","alias_value":"2202.08625v1","created_at":"2026-07-05T03:57:56.278221+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2202.08625","created_at":"2026-07-05T03:57:56.278221+00:00"},{"alias_kind":"pith_short_12","alias_value":"6CCDXNMFOZ57","created_at":"2026-07-05T03:57:56.278221+00:00"},{"alias_kind":"pith_short_16","alias_value":"6CCDXNMFOZ575PMU","created_at":"2026-07-05T03:57:56.278221+00:00"},{"alias_kind":"pith_short_8","alias_value":"6CCDXNMF","created_at":"2026-07-05T03:57:56.278221+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":1,"internal_anchor_count":0,"sample":[{"citing_arxiv_id":"2605.25619","citing_title":"Analogies between Transformer Layers and Power Method","ref_index":30,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/6CCDXNMFOZ575PMU72ZXHZKRJJ","json":"https://pith.science/pith/6CCDXNMFOZ575PMU72ZXHZKRJJ.json","graph_json":"https://pith.science/api/pith-number/6CCDXNMFOZ575PMU72ZXHZKRJJ/graph.json","events_json":"https://pith.science/api/pith-number/6CCDXNMFOZ575PMU72ZXHZKRJJ/events.json","paper":"https://pith.science/paper/6CCDXNMF"},"agent_actions":{"view_html":"https://pith.science/pith/6CCDXNMFOZ575PMU72ZXHZKRJJ","download_json":"https://pith.science/pith/6CCDXNMFOZ575PMU72ZXHZKRJJ.json","view_paper":"https://pith.science/paper/6CCDXNMF","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2202.08625&json=true","fetch_graph":"https://pith.science/api/pith-number/6CCDXNMFOZ575PMU72ZXHZKRJJ/graph.json","fetch_events":"https://pith.science/api/pith-number/6CCDXNMFOZ575PMU72ZXHZKRJJ/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/6CCDXNMFOZ575PMU72ZXHZKRJJ/action/timestamp_anchor","attest_storage":"https://pith.science/pith/6CCDXNMFOZ575PMU72ZXHZKRJJ/action/storage_attestation","attest_author":"https://pith.science/pith/6CCDXNMFOZ575PMU72ZXHZKRJJ/action/author_attestation","sign_citation":"https://pith.science/pith/6CCDXNMFOZ575PMU72ZXHZKRJJ/action/citation_signature","submit_replication":"https://pith.science/pith/6CCDXNMFOZ575PMU72ZXHZKRJJ/action/replication_record"}},"created_at":"2026-07-05T03:57:56.278221+00:00","updated_at":"2026-07-05T03:57:56.278221+00:00"}