{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2025:FUFDSRD3D3L6JH5ZON3UTXQNMF","short_pith_number":"pith:FUFDSRD3","canonical_record":{"source":{"id":"2512.07112","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2025-12-08T02:48:27Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"c2b5ebb7aeedeb1e788ad17d47c1b122eaa380ef047fc73c92a19c32a41e5934","abstract_canon_sha256":"bc3a1b6461b9e2f694a1e33e34d21e9bf8091cce6cadfc1313fb38a0ee391160"},"schema_version":"1.0"},"canonical_sha256":"2d0a39447b1ed7e49fb9737749de0d61451f92d2afb826d901418a62cae879e2","source":{"kind":"arxiv","id":"2512.07112","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2512.07112","created_at":"2026-05-18T02:45:12Z"},{"alias_kind":"arxiv_version","alias_value":"2512.07112v2","created_at":"2026-05-18T02:45:12Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2512.07112","created_at":"2026-05-18T02:45:12Z"},{"alias_kind":"pith_short_12","alias_value":"FUFDSRD3D3L6","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"FUFDSRD3D3L6JH5Z","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"FUFDSRD3","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2025:FUFDSRD3D3L6JH5ZON3UTXQNMF","target":"record","payload":{"canonical_record":{"source":{"id":"2512.07112","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2025-12-08T02:48:27Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"c2b5ebb7aeedeb1e788ad17d47c1b122eaa380ef047fc73c92a19c32a41e5934","abstract_canon_sha256":"bc3a1b6461b9e2f694a1e33e34d21e9bf8091cce6cadfc1313fb38a0ee391160"},"schema_version":"1.0"},"canonical_sha256":"2d0a39447b1ed7e49fb9737749de0d61451f92d2afb826d901418a62cae879e2","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T02:45:12.174731Z","signature_b64":"+4mGNYCPrRBQbM7XPkdDyLX4HTAGMpN27O3VhAgMSKkm0zLMWEYFz1TSdYcz5WoHhCh8E2YhDWD6Ip8tf2hABw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"2d0a39447b1ed7e49fb9737749de0d61451f92d2afb826d901418a62cae879e2","last_reissued_at":"2026-05-18T02:45:12.174216Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T02:45:12.174216Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2512.07112","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T02:45:12Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"vPlrH7kIMdea1rEzAZDTolWOIr+DdOUpck43r20/x/8VzeovwszFbEs9WUWmJ75R8Crh/l26mebBBIrWgyoTCg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-07T18:21:10.245009Z"},"content_sha256":"1e7fa81069181939935d046e4951d846ef87b347d9fd89147db625a0c9bff676","schema_version":"1.0","event_id":"sha256:1e7fa81069181939935d046e4951d846ef87b347d9fd89147db625a0c9bff676"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2025:FUFDSRD3D3L6JH5ZON3UTXQNMF","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"FOAM: Blocked State Folding for Memory-Efficient LLM Training","license":"http://creativecommons.org/licenses/by/4.0/","headline":"FOAM compresses Adam optimizer states via block-wise gradient means and residual corrections to match full convergence while cutting memory overhead by up to 90 percent.","cross_cats":["cs.AI"],"primary_cat":"cs.LG","authors_text":"Dongsheng Li, Jiahuan Wang, Ping Luo, Tao Sun, Ziqing Wen","submitted_at":"2025-12-08T02:48:27Z","abstract_excerpt":"Large language models (LLMs) have demonstrated remarkable performance due to their large parameter counts and extensive training data. However, their scale leads to significant memory bottlenecks during training, especially when using memory-intensive optimizers like Adam. Existing memory-efficient approaches often rely on techniques such as singular value decomposition (SVD), projections, or weight freezing, which can introduce substantial computational overhead, require additional memory for projections, or degrade model performance. In this paper, we propose Folded Optimizer with Approximat"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Theoretically, FOAM achieves convergence rates equivalent to vanilla Adam under standard non-convex optimization settings.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The block-wise gradient mean plus residual correction preserves sufficient information that the convergence analysis for Adam still applies without additional bias or variance terms that would invalidate the rate.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"FOAM folds Adam optimizer states into block-wise gradient means with residual correction, cutting memory overhead by up to 90% while matching vanilla Adam convergence rates under standard non-convex assumptions.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"FOAM compresses Adam optimizer states via block-wise gradient means and residual corrections to match full convergence while cutting memory overhead by up to 90 percent.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"50f14e49c49e03ffcbe9db4efa7a0a333a95207ef72013ef4f3ae50448c4219a"},"source":{"id":"2512.07112","kind":"arxiv","version":2},"verdict":{"id":"bf0988a7-9139-4781-8fb6-014beb1368e6","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-17T00:43:16.536911Z","strongest_claim":"Theoretically, FOAM achieves convergence rates equivalent to vanilla Adam under standard non-convex optimization settings.","one_line_summary":"FOAM folds Adam optimizer states into block-wise gradient means with residual correction, cutting memory overhead by up to 90% while matching vanilla Adam convergence rates under standard non-convex assumptions.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The block-wise gradient mean plus residual correction preserves sufficient information that the convergence analysis for Adam still applies without additional bias or variance terms that would invalidate the rate.","pith_extraction_headline":"FOAM compresses Adam optimizer states via block-wise gradient means and residual corrections to match full convergence while cutting memory overhead by up to 90 percent."},"references":{"count":4,"sample":[{"doi":"","year":2014,"title":"∆t√Vt +ϵ 2# = 3c1 4 ∥∇f(W t)∥2 − 1 c1 E","work_id":"f2011251-f768-4bb3-8983-82a0f432f6ec","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2048,"title":"These details are provided in Table 10","work_id":"ea250ddf-7f5d-4301-b448-41bf8b99b911","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"models. For each model, we adjust the learning rate within the range {5e-4, 1e-3, 2.5e-3, 5e-3, 1e-2}, keeping the memory-efficient scaling factors unchanged, and train for a total of 20k iterations, ","work_id":"f7a4aaac-14bd-4460-b656-bc763abd6b44","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2025,"title":"use a hybrid optimizer setup—employing vanilla Adam for modules like Embeddings and LayerNorm, while applying compressed-state optimization to Attention and MLP modules, with a scaling factor α used t","work_id":"dfd455e9-fb0d-4cfe-acdc-b55b4f721dc5","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":4,"snapshot_sha256":"ed41f0c13ac86460d43ffe6286a3ca6b139ef5fc4ac869517a4064474df11b61","internal_anchors":0},"formal_canon":{"evidence_count":1,"snapshot_sha256":"b269f90143529a901cbf72f2ad04b03ad33eb606d514a6883e622265a8f5489e"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"bf0988a7-9139-4781-8fb6-014beb1368e6"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T02:45:12Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"geCQhz1ZoY4ZzGbwTG2DqrT87OpGEJYkkJXRAl7xI4jk4yz6n6WhKepTWZIFBls/Ev01ewdkUxjRVkrNrejZDQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-07T18:21:10.246089Z"},"content_sha256":"a40c79eaf4b425391ef8b8589953856fcc4743a6f8c4a0a7d2b4fa26c8c73d64","schema_version":"1.0","event_id":"sha256:a40c79eaf4b425391ef8b8589953856fcc4743a6f8c4a0a7d2b4fa26c8c73d64"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/FUFDSRD3D3L6JH5ZON3UTXQNMF/bundle.json","state_url":"https://pith.science/pith/FUFDSRD3D3L6JH5ZON3UTXQNMF/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/FUFDSRD3D3L6JH5ZON3UTXQNMF/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-07T18:21:10Z","links":{"resolver":"https://pith.science/pith/FUFDSRD3D3L6JH5ZON3UTXQNMF","bundle":"https://pith.science/pith/FUFDSRD3D3L6JH5ZON3UTXQNMF/bundle.json","state":"https://pith.science/pith/FUFDSRD3D3L6JH5ZON3UTXQNMF/state.json","well_known_bundle":"https://pith.science/.well-known/pith/FUFDSRD3D3L6JH5ZON3UTXQNMF/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2025:FUFDSRD3D3L6JH5ZON3UTXQNMF","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"bc3a1b6461b9e2f694a1e33e34d21e9bf8091cce6cadfc1313fb38a0ee391160","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2025-12-08T02:48:27Z","title_canon_sha256":"c2b5ebb7aeedeb1e788ad17d47c1b122eaa380ef047fc73c92a19c32a41e5934"},"schema_version":"1.0","source":{"id":"2512.07112","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2512.07112","created_at":"2026-05-18T02:45:12Z"},{"alias_kind":"arxiv_version","alias_value":"2512.07112v2","created_at":"2026-05-18T02:45:12Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2512.07112","created_at":"2026-05-18T02:45:12Z"},{"alias_kind":"pith_short_12","alias_value":"FUFDSRD3D3L6","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"FUFDSRD3D3L6JH5Z","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"FUFDSRD3","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:a40c79eaf4b425391ef8b8589953856fcc4743a6f8c4a0a7d2b4fa26c8c73d64","target":"graph","created_at":"2026-05-18T02:45:12Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Theoretically, FOAM achieves convergence rates equivalent to vanilla Adam under standard non-convex optimization settings."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"The block-wise gradient mean plus residual correction preserves sufficient information that the convergence analysis for Adam still applies without additional bias or variance terms that would invalidate the rate."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"FOAM folds Adam optimizer states into block-wise gradient means with residual correction, cutting memory overhead by up to 90% while matching vanilla Adam convergence rates under standard non-convex assumptions."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"FOAM compresses Adam optimizer states via block-wise gradient means and residual corrections to match full convergence while cutting memory overhead by up to 90 percent."}],"snapshot_sha256":"50f14e49c49e03ffcbe9db4efa7a0a333a95207ef72013ef4f3ae50448c4219a"},"formal_canon":{"evidence_count":1,"snapshot_sha256":"b269f90143529a901cbf72f2ad04b03ad33eb606d514a6883e622265a8f5489e"},"paper":{"abstract_excerpt":"Large language models (LLMs) have demonstrated remarkable performance due to their large parameter counts and extensive training data. However, their scale leads to significant memory bottlenecks during training, especially when using memory-intensive optimizers like Adam. Existing memory-efficient approaches often rely on techniques such as singular value decomposition (SVD), projections, or weight freezing, which can introduce substantial computational overhead, require additional memory for projections, or degrade model performance. In this paper, we propose Folded Optimizer with Approximat","authors_text":"Dongsheng Li, Jiahuan Wang, Ping Luo, Tao Sun, Ziqing Wen","cross_cats":["cs.AI"],"headline":"FOAM compresses Adam optimizer states via block-wise gradient means and residual corrections to match full convergence while cutting memory overhead by up to 90 percent.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2025-12-08T02:48:27Z","title":"FOAM: Blocked State Folding for Memory-Efficient LLM Training"},"references":{"count":4,"internal_anchors":0,"resolved_work":4,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"∆t√Vt +ϵ 2# = 3c1 4 ∥∇f(W t)∥2 − 1 c1 E","work_id":"f2011251-f768-4bb3-8983-82a0f432f6ec","year":2014},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"These details are provided in Table 10","work_id":"ea250ddf-7f5d-4301-b448-41bf8b99b911","year":2048},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"models. For each model, we adjust the learning rate within the range {5e-4, 1e-3, 2.5e-3, 5e-3, 1e-2}, keeping the memory-efficient scaling factors unchanged, and train for a total of 20k iterations, ","work_id":"f7a4aaac-14bd-4460-b656-bc763abd6b44","year":2024},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"use a hybrid optimizer setup—employing vanilla Adam for modules like Embeddings and LayerNorm, while applying compressed-state optimization to Attention and MLP modules, with a scaling factor α used t","work_id":"dfd455e9-fb0d-4cfe-acdc-b55b4f721dc5","year":2025}],"snapshot_sha256":"ed41f0c13ac86460d43ffe6286a3ca6b139ef5fc4ac869517a4064474df11b61"},"source":{"id":"2512.07112","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-17T00:43:16.536911Z","id":"bf0988a7-9139-4781-8fb6-014beb1368e6","model_set":{"reader":"grok-4.3"},"one_line_summary":"FOAM folds Adam optimizer states into block-wise gradient means with residual correction, cutting memory overhead by up to 90% while matching vanilla Adam convergence rates under standard non-convex assumptions.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"FOAM compresses Adam optimizer states via block-wise gradient means and residual corrections to match full convergence while cutting memory overhead by up to 90 percent.","strongest_claim":"Theoretically, FOAM achieves convergence rates equivalent to vanilla Adam under standard non-convex optimization settings.","weakest_assumption":"The block-wise gradient mean plus residual correction preserves sufficient information that the convergence analysis for Adam still applies without additional bias or variance terms that would invalidate the rate."}},"verdict_id":"bf0988a7-9139-4781-8fb6-014beb1368e6"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:1e7fa81069181939935d046e4951d846ef87b347d9fd89147db625a0c9bff676","target":"record","created_at":"2026-05-18T02:45:12Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"bc3a1b6461b9e2f694a1e33e34d21e9bf8091cce6cadfc1313fb38a0ee391160","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2025-12-08T02:48:27Z","title_canon_sha256":"c2b5ebb7aeedeb1e788ad17d47c1b122eaa380ef047fc73c92a19c32a41e5934"},"schema_version":"1.0","source":{"id":"2512.07112","kind":"arxiv","version":2}},"canonical_sha256":"2d0a39447b1ed7e49fb9737749de0d61451f92d2afb826d901418a62cae879e2","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"2d0a39447b1ed7e49fb9737749de0d61451f92d2afb826d901418a62cae879e2","first_computed_at":"2026-05-18T02:45:12.174216Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-18T02:45:12.174216Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"+4mGNYCPrRBQbM7XPkdDyLX4HTAGMpN27O3VhAgMSKkm0zLMWEYFz1TSdYcz5WoHhCh8E2YhDWD6Ip8tf2hABw==","signature_status":"signed_v1","signed_at":"2026-05-18T02:45:12.174731Z","signed_message":"canonical_sha256_bytes"},"source_id":"2512.07112","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:1e7fa81069181939935d046e4951d846ef87b347d9fd89147db625a0c9bff676","sha256:a40c79eaf4b425391ef8b8589953856fcc4743a6f8c4a0a7d2b4fa26c8c73d64"],"state_sha256":"e0d19b5096d5ff078ac0ce513a3606365ebf6064962e558cae26e3595b56ccac"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"vwJ066ghoYTyYOtU0HHdjVoJE7aAVlRr/guL2/oQHUMrn5ikwV7vQXZa0moyGjJyH+PgESAebIfV/o37dtfsDA==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-07T18:21:10.251054Z","bundle_sha256":"48308bb32004db7d12010d80ab305a0bbb2e9dc3215adcfde2b473dde8050915"}}