{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:I477SBLGD5WLDCP7DL657LGZJS","short_pith_number":"pith:I477SBLG","schema_version":"1.0","canonical_sha256":"473ff905661f6cb189ff1afddfacd94c90679239ab50599372629ea24b998857","source":{"kind":"arxiv","id":"2505.24333","version":3},"attestation_state":"computed","paper":{"title":"Two failure modes of deep transformers and how to avoid them: a unified theory of signal propagation at initialisation","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cond-mat.dis-nn","cond-mat.stat-mech","cs.LG"],"primary_cat":"stat.ML","authors_text":"Alessio Giorlandino, Sebastian Goldt","submitted_at":"2025-05-30T08:18:23Z","abstract_excerpt":"Finding the right initialisation for neural networks is crucial to ensure smooth training and good performance. In transformers, the wrong initialisation can lead to one of two failure modes of self-attention layers: rank collapse, where all tokens collapse into similar representations, and entropy collapse, where highly concentrated attention scores lead to training instability. While previous work has studied different scaling regimes for transformers, an asymptotically exact, down-to-the constant prescription for how to initialise transformers has so far been lacking. Here, we provide an an"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2505.24333","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"stat.ML","submitted_at":"2025-05-30T08:18:23Z","cross_cats_sorted":["cond-mat.dis-nn","cond-mat.stat-mech","cs.LG"],"title_canon_sha256":"8c3d8cf4b8d5a38546996c381e7822e264f761782815222187c09c67a15e79bb","abstract_canon_sha256":"36c57741ef6dd962bdc9a2890666cd461274c58643108316d8c314a8ded05717"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-21T02:04:48.958433Z","signature_b64":"0EVVKSM9I0gaDEOpVkkwyV0Xo6s/ZfSup80K/QpRCMFMk+o6VhJoFCOhzME5inZ4iRI6C+AjjoO8y0TmYkCnBA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"473ff905661f6cb189ff1afddfacd94c90679239ab50599372629ea24b998857","last_reissued_at":"2026-05-21T02:04:48.957473Z","signature_status":"signed_v1","first_computed_at":"2026-05-21T02:04:48.957473Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Two failure modes of deep transformers and how to avoid them: a unified theory of signal propagation at initialisation","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cond-mat.dis-nn","cond-mat.stat-mech","cs.LG"],"primary_cat":"stat.ML","authors_text":"Alessio Giorlandino, Sebastian Goldt","submitted_at":"2025-05-30T08:18:23Z","abstract_excerpt":"Finding the right initialisation for neural networks is crucial to ensure smooth training and good performance. In transformers, the wrong initialisation can lead to one of two failure modes of self-attention layers: rank collapse, where all tokens collapse into similar representations, and entropy collapse, where highly concentrated attention scores lead to training instability. While previous work has studied different scaling regimes for transformers, an asymptotically exact, down-to-the constant prescription for how to initialise transformers has so far been lacking. Here, we provide an an"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2505.24333","kind":"arxiv","version":3},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2505.24333/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2505.24333","created_at":"2026-05-21T02:04:48.957604+00:00"},{"alias_kind":"arxiv_version","alias_value":"2505.24333v3","created_at":"2026-05-21T02:04:48.957604+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2505.24333","created_at":"2026-05-21T02:04:48.957604+00:00"},{"alias_kind":"pith_short_12","alias_value":"I477SBLGD5WL","created_at":"2026-05-21T02:04:48.957604+00:00"},{"alias_kind":"pith_short_16","alias_value":"I477SBLGD5WLDCP7","created_at":"2026-05-21T02:04:48.957604+00:00"},{"alias_kind":"pith_short_8","alias_value":"I477SBLG","created_at":"2026-05-21T02:04:48.957604+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":3,"internal_anchor_count":3,"sample":[{"citing_arxiv_id":"2601.21366","citing_title":"Perceptrons and localization of attention's mean-field landscape","ref_index":7,"is_internal_anchor":true},{"citing_arxiv_id":"2605.12697","citing_title":"A Unified Framework for Critical Scaling of Inverse Temperature in Self-Attention","ref_index":8,"is_internal_anchor":true},{"citing_arxiv_id":"2605.08453","citing_title":"Sink vs. diagonal patterns as mechanisms for attention switch and oversmoothing prevention","ref_index":11,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/I477SBLGD5WLDCP7DL657LGZJS","json":"https://pith.science/pith/I477SBLGD5WLDCP7DL657LGZJS.json","graph_json":"https://pith.science/api/pith-number/I477SBLGD5WLDCP7DL657LGZJS/graph.json","events_json":"https://pith.science/api/pith-number/I477SBLGD5WLDCP7DL657LGZJS/events.json","paper":"https://pith.science/paper/I477SBLG"},"agent_actions":{"view_html":"https://pith.science/pith/I477SBLGD5WLDCP7DL657LGZJS","download_json":"https://pith.science/pith/I477SBLGD5WLDCP7DL657LGZJS.json","view_paper":"https://pith.science/paper/I477SBLG","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2505.24333&json=true","fetch_graph":"https://pith.science/api/pith-number/I477SBLGD5WLDCP7DL657LGZJS/graph.json","fetch_events":"https://pith.science/api/pith-number/I477SBLGD5WLDCP7DL657LGZJS/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/I477SBLGD5WLDCP7DL657LGZJS/action/timestamp_anchor","attest_storage":"https://pith.science/pith/I477SBLGD5WLDCP7DL657LGZJS/action/storage_attestation","attest_author":"https://pith.science/pith/I477SBLGD5WLDCP7DL657LGZJS/action/author_attestation","sign_citation":"https://pith.science/pith/I477SBLGD5WLDCP7DL657LGZJS/action/citation_signature","submit_replication":"https://pith.science/pith/I477SBLGD5WLDCP7DL657LGZJS/action/replication_record"}},"created_at":"2026-05-21T02:04:48.957604+00:00","updated_at":"2026-05-21T02:04:48.957604+00:00"}