{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:ZGLDJR3NRERIJGGU7PHVEPXXF6","short_pith_number":"pith:ZGLDJR3N","schema_version":"1.0","canonical_sha256":"c99634c76d89228498d4fbcf523ef72fad8a2f392460aa9edcc72ac4a9679ec8","source":{"kind":"arxiv","id":"2602.08064","version":2},"attestation_state":"computed","paper":{"title":"SiameseNorm: Breaking the Barrier to Reconciling Pre/Post-Norm","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","cs.CL"],"primary_cat":"cs.LG","authors_text":"Dongchen Han, Erchao Zhao, Gao Huang, Guanjun Jiang, Haofeng Huang, Mengyu Zhou, Ming Chen, Tianyu Li, Xiaoxi Jiang, Zixuan Cao","submitted_at":"2026-02-08T17:17:56Z","abstract_excerpt":"The long-standing tension between Pre- and Post-Norm remains an open problem in Transformer architecture, reflecting a fundamental trade-off between training stability and representational capacity. Prior attempts to combine their strengths have made progress, but often show limited robustness across training settings, restricting their broader applicability. We revisit this dilemma, showing that single-stream architectures struggle to reconcile Pre-Norm's stable identity-gradient propagation with Post-Norm's normalization of the main residual path. To address this structural tension, we propo"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2602.08064","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2026-02-08T17:17:56Z","cross_cats_sorted":["cs.AI","cs.CL"],"title_canon_sha256":"78a6288759dd1d669a4df59bc92df5facc82d000e5543acea13bd88c46332ffe","abstract_canon_sha256":"3b48f511745918833486aff888624e6bc1ae6a29380d9259d574987fda36327f"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-22T01:03:56.937733Z","signature_b64":"4P4Qc7D9Ss/HxVwEKsCF9k5HugkBl/XcrEAbr/LPrbI1H8GJcu8+qVpG7Z62kfP4KJhgzqsg3QxAK/kH+ZofCQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"c99634c76d89228498d4fbcf523ef72fad8a2f392460aa9edcc72ac4a9679ec8","last_reissued_at":"2026-05-22T01:03:56.936787Z","signature_status":"signed_v1","first_computed_at":"2026-05-22T01:03:56.936787Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"SiameseNorm: Breaking the Barrier to Reconciling Pre/Post-Norm","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","cs.CL"],"primary_cat":"cs.LG","authors_text":"Dongchen Han, Erchao Zhao, Gao Huang, Guanjun Jiang, Haofeng Huang, Mengyu Zhou, Ming Chen, Tianyu Li, Xiaoxi Jiang, Zixuan Cao","submitted_at":"2026-02-08T17:17:56Z","abstract_excerpt":"The long-standing tension between Pre- and Post-Norm remains an open problem in Transformer architecture, reflecting a fundamental trade-off between training stability and representational capacity. Prior attempts to combine their strengths have made progress, but often show limited robustness across training settings, restricting their broader applicability. We revisit this dilemma, showing that single-stream architectures struggle to reconcile Pre-Norm's stable identity-gradient propagation with Post-Norm's normalization of the main residual path. To address this structural tension, we propo"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2602.08064","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2602.08064/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2602.08064","created_at":"2026-05-22T01:03:56.936915+00:00"},{"alias_kind":"arxiv_version","alias_value":"2602.08064v2","created_at":"2026-05-22T01:03:56.936915+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2602.08064","created_at":"2026-05-22T01:03:56.936915+00:00"},{"alias_kind":"pith_short_12","alias_value":"ZGLDJR3NRERI","created_at":"2026-05-22T01:03:56.936915+00:00"},{"alias_kind":"pith_short_16","alias_value":"ZGLDJR3NRERIJGGU","created_at":"2026-05-22T01:03:56.936915+00:00"},{"alias_kind":"pith_short_8","alias_value":"ZGLDJR3N","created_at":"2026-05-22T01:03:56.936915+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":6,"internal_anchor_count":6,"sample":[{"citing_arxiv_id":"2605.23259","citing_title":"Multi-Gate Residuals","ref_index":24,"is_internal_anchor":true},{"citing_arxiv_id":"2605.12374","citing_title":"Fill the GAP: A Granular Alignment Paradigm for Visual Reasoning in Multimodal Large Language Models","ref_index":23,"is_internal_anchor":true},{"citing_arxiv_id":"2603.15031","citing_title":"Attention Residuals","ref_index":27,"is_internal_anchor":true},{"citing_arxiv_id":"2605.20708","citing_title":"Rethinking Cross-Layer Information Routing in Diffusion Transformers","ref_index":30,"is_internal_anchor":true},{"citing_arxiv_id":"2605.12374","citing_title":"Fill the GAP: A Granular Alignment Paradigm for Visual Reasoning in Multimodal Large Language Models","ref_index":23,"is_internal_anchor":true},{"citing_arxiv_id":"2605.12374","citing_title":"Fill the GAP: A Granular Alignment Paradigm for Visual Reasoning in Multimodal Large Language Models","ref_index":23,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/ZGLDJR3NRERIJGGU7PHVEPXXF6","json":"https://pith.science/pith/ZGLDJR3NRERIJGGU7PHVEPXXF6.json","graph_json":"https://pith.science/api/pith-number/ZGLDJR3NRERIJGGU7PHVEPXXF6/graph.json","events_json":"https://pith.science/api/pith-number/ZGLDJR3NRERIJGGU7PHVEPXXF6/events.json","paper":"https://pith.science/paper/ZGLDJR3N"},"agent_actions":{"view_html":"https://pith.science/pith/ZGLDJR3NRERIJGGU7PHVEPXXF6","download_json":"https://pith.science/pith/ZGLDJR3NRERIJGGU7PHVEPXXF6.json","view_paper":"https://pith.science/paper/ZGLDJR3N","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2602.08064&json=true","fetch_graph":"https://pith.science/api/pith-number/ZGLDJR3NRERIJGGU7PHVEPXXF6/graph.json","fetch_events":"https://pith.science/api/pith-number/ZGLDJR3NRERIJGGU7PHVEPXXF6/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/ZGLDJR3NRERIJGGU7PHVEPXXF6/action/timestamp_anchor","attest_storage":"https://pith.science/pith/ZGLDJR3NRERIJGGU7PHVEPXXF6/action/storage_attestation","attest_author":"https://pith.science/pith/ZGLDJR3NRERIJGGU7PHVEPXXF6/action/author_attestation","sign_citation":"https://pith.science/pith/ZGLDJR3NRERIJGGU7PHVEPXXF6/action/citation_signature","submit_replication":"https://pith.science/pith/ZGLDJR3NRERIJGGU7PHVEPXXF6/action/replication_record"}},"created_at":"2026-05-22T01:03:56.936915+00:00","updated_at":"2026-05-22T01:03:56.936915+00:00"}