{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:YAGCUA4C52ZIQ3M6SPCT4XGGQS","short_pith_number":"pith:YAGCUA4C","schema_version":"1.0","canonical_sha256":"c00c2a0382eeb2886d9e93c53e5cc6849a9efce1b1d8f746d50273af765c8d7c","source":{"kind":"arxiv","id":"2602.20433","version":2},"attestation_state":"computed","paper":{"title":"Disentangling Geometry, Performance, and Training in Language Models","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Arjun Subramonian, Atharva Kulkarni, Jacob Mitchell Springer, Swabha Swayamdipta","submitted_at":"2026-02-24T00:31:04Z","abstract_excerpt":"Geometric properties of Transformer weights, particularly the unembedding matrix, have been widely useful in language model interpretability research. Yet, their utility for estimating downstream performance remains unclear. In this work, we systematically investigate the relationship between model performance and the unembedding matrix geometry, particularly its effective rank. Our experiments, involving a suite of 108 OLMo-style language models trained under controlled variation, reveal several key findings. While the best-performing models often exhibit a high effective rank, this trend is "},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2602.20433","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-02-24T00:31:04Z","cross_cats_sorted":[],"title_canon_sha256":"2587fa4a8d636835ac7e322a8586f0787da0e84c6eb827603aec704f57496908","abstract_canon_sha256":"3676cf324dcfa0326af661c5000ad233623276a532b4eae21ae7a6b348fd656b"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-23T02:13:21.406672Z","signature_b64":"f4DSXVBKGWTY6gPYlVMDY04hVXe6YFhehW7Va1wABPXxLzJEM2Bf/ojISwMO7DoTuj2LWZI2LklGf3hWZVlGDA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"c00c2a0382eeb2886d9e93c53e5cc6849a9efce1b1d8f746d50273af765c8d7c","last_reissued_at":"2026-06-23T02:13:21.406244Z","signature_status":"signed_v1","first_computed_at":"2026-06-23T02:13:21.406244Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Disentangling Geometry, Performance, and Training in Language Models","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Arjun Subramonian, Atharva Kulkarni, Jacob Mitchell Springer, Swabha Swayamdipta","submitted_at":"2026-02-24T00:31:04Z","abstract_excerpt":"Geometric properties of Transformer weights, particularly the unembedding matrix, have been widely useful in language model interpretability research. Yet, their utility for estimating downstream performance remains unclear. In this work, we systematically investigate the relationship between model performance and the unembedding matrix geometry, particularly its effective rank. Our experiments, involving a suite of 108 OLMo-style language models trained under controlled variation, reveal several key findings. While the best-performing models often exhibit a high effective rank, this trend is "},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2602.20433","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2602.20433/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2602.20433","created_at":"2026-06-23T02:13:21.406297+00:00"},{"alias_kind":"arxiv_version","alias_value":"2602.20433v2","created_at":"2026-06-23T02:13:21.406297+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2602.20433","created_at":"2026-06-23T02:13:21.406297+00:00"},{"alias_kind":"pith_short_12","alias_value":"YAGCUA4C52ZI","created_at":"2026-06-23T02:13:21.406297+00:00"},{"alias_kind":"pith_short_16","alias_value":"YAGCUA4C52ZIQ3M6","created_at":"2026-06-23T02:13:21.406297+00:00"},{"alias_kind":"pith_short_8","alias_value":"YAGCUA4C","created_at":"2026-06-23T02:13:21.406297+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":2,"internal_anchor_count":2,"sample":[{"citing_arxiv_id":"2606.06418","citing_title":"Double Preconditioning (DoPr): Optimization for Test-Time Performance, not Validation Loss","ref_index":192,"is_internal_anchor":true},{"citing_arxiv_id":"2605.17084","citing_title":"Scale Determines Whether Language Models Organize Representation Geometry for Prediction","ref_index":4,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/YAGCUA4C52ZIQ3M6SPCT4XGGQS","json":"https://pith.science/pith/YAGCUA4C52ZIQ3M6SPCT4XGGQS.json","graph_json":"https://pith.science/api/pith-number/YAGCUA4C52ZIQ3M6SPCT4XGGQS/graph.json","events_json":"https://pith.science/api/pith-number/YAGCUA4C52ZIQ3M6SPCT4XGGQS/events.json","paper":"https://pith.science/paper/YAGCUA4C"},"agent_actions":{"view_html":"https://pith.science/pith/YAGCUA4C52ZIQ3M6SPCT4XGGQS","download_json":"https://pith.science/pith/YAGCUA4C52ZIQ3M6SPCT4XGGQS.json","view_paper":"https://pith.science/paper/YAGCUA4C","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2602.20433&json=true","fetch_graph":"https://pith.science/api/pith-number/YAGCUA4C52ZIQ3M6SPCT4XGGQS/graph.json","fetch_events":"https://pith.science/api/pith-number/YAGCUA4C52ZIQ3M6SPCT4XGGQS/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/YAGCUA4C52ZIQ3M6SPCT4XGGQS/action/timestamp_anchor","attest_storage":"https://pith.science/pith/YAGCUA4C52ZIQ3M6SPCT4XGGQS/action/storage_attestation","attest_author":"https://pith.science/pith/YAGCUA4C52ZIQ3M6SPCT4XGGQS/action/author_attestation","sign_citation":"https://pith.science/pith/YAGCUA4C52ZIQ3M6SPCT4XGGQS/action/citation_signature","submit_replication":"https://pith.science/pith/YAGCUA4C52ZIQ3M6SPCT4XGGQS/action/replication_record"}},"created_at":"2026-06-23T02:13:21.406297+00:00","updated_at":"2026-06-23T02:13:21.406297+00:00"}