{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:I3CPQ63NKWO2XCMKN6PTMHPTIJ","short_pith_number":"pith:I3CPQ63N","canonical_record":{"source":{"id":"2603.13381","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-03-11T03:13:10Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"c078383ab9c6950b4a8b2163a0b527fe12d53317c4b9abde170fb63f272e271d","abstract_canon_sha256":"08ed397f88febbe07b6a0ffb9f291c4479b617a901fa0ccc23215fbf80cc6831"},"schema_version":"1.0"},"canonical_sha256":"46c4f87b6d559dab898a6f9f361df3427f32d746d42c902acd9afcbbdb567555","source":{"kind":"arxiv","id":"2603.13381","version":3},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2603.13381","created_at":"2026-05-27T01:04:57Z"},{"alias_kind":"arxiv_version","alias_value":"2603.13381v3","created_at":"2026-05-27T01:04:57Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2603.13381","created_at":"2026-05-27T01:04:57Z"},{"alias_kind":"pith_short_12","alias_value":"I3CPQ63NKWO2","created_at":"2026-05-27T01:04:57Z"},{"alias_kind":"pith_short_16","alias_value":"I3CPQ63NKWO2XCMK","created_at":"2026-05-27T01:04:57Z"},{"alias_kind":"pith_short_8","alias_value":"I3CPQ63N","created_at":"2026-05-27T01:04:57Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:I3CPQ63NKWO2XCMKN6PTMHPTIJ","target":"record","payload":{"canonical_record":{"source":{"id":"2603.13381","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-03-11T03:13:10Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"c078383ab9c6950b4a8b2163a0b527fe12d53317c4b9abde170fb63f272e271d","abstract_canon_sha256":"08ed397f88febbe07b6a0ffb9f291c4479b617a901fa0ccc23215fbf80cc6831"},"schema_version":"1.0"},"canonical_sha256":"46c4f87b6d559dab898a6f9f361df3427f32d746d42c902acd9afcbbdb567555","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-27T01:04:57.216108Z","signature_b64":"gxLEpGj2G7tg883sdqDfmVSmouYwWN2dN5dKowmClx2Y8EIlMgECFpiaiBi5lj6eB71Gmz7/o4qwK918obNGDw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"46c4f87b6d559dab898a6f9f361df3427f32d746d42c902acd9afcbbdb567555","last_reissued_at":"2026-05-27T01:04:57.215379Z","signature_status":"signed_v1","first_computed_at":"2026-05-27T01:04:57.215379Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2603.13381","source_version":3,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-27T01:04:57Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"vFeucmwaTYbuXNQ86I2lMXtm3VAcApYQYGVn9odILrNPdjcvLmJbMHbTySIfufHp2+ptXqJxquyPGlX6UBpzDw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-02T19:21:46.412016Z"},"content_sha256":"7af39d1909329a34067e6ef67c365672822c78e67a00ece1603f9a772dcb9eeb","schema_version":"1.0","event_id":"sha256:7af39d1909329a34067e6ef67c365672822c78e67a00ece1603f9a772dcb9eeb"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:I3CPQ63NKWO2XCMKN6PTMHPTIJ","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Beyond Linearity in Attention Projections: The Case for Nonlinear Queries","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Replacing the linear query projection with identity plus a small bottleneck MLP improves validation log-loss by 2.4 percent in GPT-style models.","cross_cats":["cs.AI"],"primary_cat":"cs.LG","authors_text":"Marko Karbevski","submitted_at":"2026-03-11T03:13:10Z","abstract_excerpt":"Recent algebraic analysis shows that in decoder-only and encoder-only transformers, the Query projection $W_Q$ may be set to identity without noticeable performance deterioration. This is possible because attention depends on $X$ only through the products $XW_Q, XW_K, XW_V$, allowing basis transformations to be absorbed by adjacent layers and propagated through the network. We replace $W_Q \\in \\R^{d \\times d}$ with a nonlinear residual of the form $Q(X) = X + f_\\theta(X)$, where $f_\\theta$ is a bottleneck MLP with $d^2 + O(d)$ parameters. The identity term anchors the nonlinearity to a known-g"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Experiments on GPT-3 small style models show consistent improvement over the baseline (2.40% lower validation log-loss, 6.81% lower perplexity), comfortably outperforming a model with 12.5% more non-embedding parameters.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the basis transformations absorbed by adjacent layers remain stable when the query projection is made nonlinear and that the small MLP does not introduce optimization instabilities at the tested scale.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Nonlinear query projections of the form X + MLP(X) improve transformer performance on small models with only d² + O(d) added parameters.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Replacing the linear query projection with identity plus a small bottleneck MLP improves validation log-loss by 2.4 percent in GPT-style models.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"fcea7f1a3b92b22a4f4684b0ab657fbaeffa51e3b61faafacf0d238e7e2661e0"},"source":{"id":"2603.13381","kind":"arxiv","version":3},"verdict":{"id":"e6df9ef0-e814-49b9-a2ee-301c7ae7a456","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T12:58:43.278825Z","strongest_claim":"Experiments on GPT-3 small style models show consistent improvement over the baseline (2.40% lower validation log-loss, 6.81% lower perplexity), comfortably outperforming a model with 12.5% more non-embedding parameters.","one_line_summary":"Nonlinear query projections of the form X + MLP(X) improve transformer performance on small models with only d² + O(d) added parameters.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the basis transformations absorbed by adjacent layers remain stable when the query projection is made nonlinear and that the small MLP does not introduce optimization instabilities at the tested scale.","pith_extraction_headline":"Replacing the linear query projection with identity plus a small bottleneck MLP improves validation log-loss by 2.4 percent in GPT-style models."},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2603.13381/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"e6df9ef0-e814-49b9-a2ee-301c7ae7a456"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-27T01:04:57Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"63fDj0GKtJzs1YllFwpJs7dbolo1T5lr61BXUESCzziKoiNvckJ8glaNrmPiJDsTikNPfUqeuj944NmD9fQMDQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-02T19:21:46.412507Z"},"content_sha256":"e97bf8bb1e38dd8061cbdc8c5a6853b1f8dbf9ecf0f5e0fa4cfa05789b1f3a61","schema_version":"1.0","event_id":"sha256:e97bf8bb1e38dd8061cbdc8c5a6853b1f8dbf9ecf0f5e0fa4cfa05789b1f3a61"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/I3CPQ63NKWO2XCMKN6PTMHPTIJ/bundle.json","state_url":"https://pith.science/pith/I3CPQ63NKWO2XCMKN6PTMHPTIJ/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/I3CPQ63NKWO2XCMKN6PTMHPTIJ/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-02T19:21:46Z","links":{"resolver":"https://pith.science/pith/I3CPQ63NKWO2XCMKN6PTMHPTIJ","bundle":"https://pith.science/pith/I3CPQ63NKWO2XCMKN6PTMHPTIJ/bundle.json","state":"https://pith.science/pith/I3CPQ63NKWO2XCMKN6PTMHPTIJ/state.json","well_known_bundle":"https://pith.science/.well-known/pith/I3CPQ63NKWO2XCMKN6PTMHPTIJ/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:I3CPQ63NKWO2XCMKN6PTMHPTIJ","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"08ed397f88febbe07b6a0ffb9f291c4479b617a901fa0ccc23215fbf80cc6831","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-03-11T03:13:10Z","title_canon_sha256":"c078383ab9c6950b4a8b2163a0b527fe12d53317c4b9abde170fb63f272e271d"},"schema_version":"1.0","source":{"id":"2603.13381","kind":"arxiv","version":3}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2603.13381","created_at":"2026-05-27T01:04:57Z"},{"alias_kind":"arxiv_version","alias_value":"2603.13381v3","created_at":"2026-05-27T01:04:57Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2603.13381","created_at":"2026-05-27T01:04:57Z"},{"alias_kind":"pith_short_12","alias_value":"I3CPQ63NKWO2","created_at":"2026-05-27T01:04:57Z"},{"alias_kind":"pith_short_16","alias_value":"I3CPQ63NKWO2XCMK","created_at":"2026-05-27T01:04:57Z"},{"alias_kind":"pith_short_8","alias_value":"I3CPQ63N","created_at":"2026-05-27T01:04:57Z"}],"graph_snapshots":[{"event_id":"sha256:e97bf8bb1e38dd8061cbdc8c5a6853b1f8dbf9ecf0f5e0fa4cfa05789b1f3a61","target":"graph","created_at":"2026-05-27T01:04:57Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Experiments on GPT-3 small style models show consistent improvement over the baseline (2.40% lower validation log-loss, 6.81% lower perplexity), comfortably outperforming a model with 12.5% more non-embedding parameters."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the basis transformations absorbed by adjacent layers remain stable when the query projection is made nonlinear and that the small MLP does not introduce optimization instabilities at the tested scale."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Nonlinear query projections of the form X + MLP(X) improve transformer performance on small models with only d² + O(d) added parameters."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Replacing the linear query projection with identity plus a small bottleneck MLP improves validation log-loss by 2.4 percent in GPT-style models."}],"snapshot_sha256":"fcea7f1a3b92b22a4f4684b0ab657fbaeffa51e3b61faafacf0d238e7e2661e0"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2603.13381/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Recent algebraic analysis shows that in decoder-only and encoder-only transformers, the Query projection $W_Q$ may be set to identity without noticeable performance deterioration. This is possible because attention depends on $X$ only through the products $XW_Q, XW_K, XW_V$, allowing basis transformations to be absorbed by adjacent layers and propagated through the network. We replace $W_Q \\in \\R^{d \\times d}$ with a nonlinear residual of the form $Q(X) = X + f_\\theta(X)$, where $f_\\theta$ is a bottleneck MLP with $d^2 + O(d)$ parameters. The identity term anchors the nonlinearity to a known-g","authors_text":"Marko Karbevski","cross_cats":["cs.AI"],"headline":"Replacing the linear query projection with identity plus a small bottleneck MLP improves validation log-loss by 2.4 percent in GPT-style models.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-03-11T03:13:10Z","title":"Beyond Linearity in Attention Projections: The Case for Nonlinear Queries"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2603.13381","kind":"arxiv","version":3},"verdict":{"created_at":"2026-05-15T12:58:43.278825Z","id":"e6df9ef0-e814-49b9-a2ee-301c7ae7a456","model_set":{"reader":"grok-4.3"},"one_line_summary":"Nonlinear query projections of the form X + MLP(X) improve transformer performance on small models with only d² + O(d) added parameters.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Replacing the linear query projection with identity plus a small bottleneck MLP improves validation log-loss by 2.4 percent in GPT-style models.","strongest_claim":"Experiments on GPT-3 small style models show consistent improvement over the baseline (2.40% lower validation log-loss, 6.81% lower perplexity), comfortably outperforming a model with 12.5% more non-embedding parameters.","weakest_assumption":"That the basis transformations absorbed by adjacent layers remain stable when the query projection is made nonlinear and that the small MLP does not introduce optimization instabilities at the tested scale."}},"verdict_id":"e6df9ef0-e814-49b9-a2ee-301c7ae7a456"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:7af39d1909329a34067e6ef67c365672822c78e67a00ece1603f9a772dcb9eeb","target":"record","created_at":"2026-05-27T01:04:57Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"08ed397f88febbe07b6a0ffb9f291c4479b617a901fa0ccc23215fbf80cc6831","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-03-11T03:13:10Z","title_canon_sha256":"c078383ab9c6950b4a8b2163a0b527fe12d53317c4b9abde170fb63f272e271d"},"schema_version":"1.0","source":{"id":"2603.13381","kind":"arxiv","version":3}},"canonical_sha256":"46c4f87b6d559dab898a6f9f361df3427f32d746d42c902acd9afcbbdb567555","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"46c4f87b6d559dab898a6f9f361df3427f32d746d42c902acd9afcbbdb567555","first_computed_at":"2026-05-27T01:04:57.215379Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-27T01:04:57.215379Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"gxLEpGj2G7tg883sdqDfmVSmouYwWN2dN5dKowmClx2Y8EIlMgECFpiaiBi5lj6eB71Gmz7/o4qwK918obNGDw==","signature_status":"signed_v1","signed_at":"2026-05-27T01:04:57.216108Z","signed_message":"canonical_sha256_bytes"},"source_id":"2603.13381","source_kind":"arxiv","source_version":3}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:7af39d1909329a34067e6ef67c365672822c78e67a00ece1603f9a772dcb9eeb","sha256:e97bf8bb1e38dd8061cbdc8c5a6853b1f8dbf9ecf0f5e0fa4cfa05789b1f3a61"],"state_sha256":"b2e7e7e668c4a6753622dd93254ec58cd7c8251672703f72de88470184852662"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"yDQikOd+hf6N18Dx58hy0cEuHLSAyMC95W2sy8QiKKuXezIgveOMAz/DkFQ0y6/EQoCfy1kzTJUnIXzYVDz1Cg==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-02T19:21:46.414970Z","bundle_sha256":"c6744101d5b8625f5d41d9378f193e1a88bda34b8467f1b489aec435e641d1db"}}