{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2024:2OKGTL5HUEDCY3JJ3ZDWUMELAO","short_pith_number":"pith:2OKGTL5H","canonical_record":{"source":{"id":"2412.12636","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.DC","submitted_at":"2024-12-17T07:59:31Z","cross_cats_sorted":["cs.AI","cs.LG","cs.PF"],"title_canon_sha256":"5a23546ddcfbe8f6d331576cbf033311683857ab6c6b0c96027edad8ac298b3c","abstract_canon_sha256":"615d36d9a92f76674430e9858df476f1fa7a165ed2ef76a6ffc8f4696e6206f7"},"schema_version":"1.0"},"canonical_sha256":"d39469afa7a1062c6d29de476a308b039a1889fbc7e98fb165ed424b108ecf27","source":{"kind":"arxiv","id":"2412.12636","version":3},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2412.12636","created_at":"2026-05-20T00:00:17Z"},{"alias_kind":"arxiv_version","alias_value":"2412.12636v3","created_at":"2026-05-20T00:00:17Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2412.12636","created_at":"2026-05-20T00:00:17Z"},{"alias_kind":"pith_short_12","alias_value":"2OKGTL5HUEDC","created_at":"2026-05-20T00:00:17Z"},{"alias_kind":"pith_short_16","alias_value":"2OKGTL5HUEDCY3JJ","created_at":"2026-05-20T00:00:17Z"},{"alias_kind":"pith_short_8","alias_value":"2OKGTL5H","created_at":"2026-05-20T00:00:17Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2024:2OKGTL5HUEDCY3JJ3ZDWUMELAO","target":"record","payload":{"canonical_record":{"source":{"id":"2412.12636","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.DC","submitted_at":"2024-12-17T07:59:31Z","cross_cats_sorted":["cs.AI","cs.LG","cs.PF"],"title_canon_sha256":"5a23546ddcfbe8f6d331576cbf033311683857ab6c6b0c96027edad8ac298b3c","abstract_canon_sha256":"615d36d9a92f76674430e9858df476f1fa7a165ed2ef76a6ffc8f4696e6206f7"},"schema_version":"1.0"},"canonical_sha256":"d39469afa7a1062c6d29de476a308b039a1889fbc7e98fb165ed424b108ecf27","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T00:00:17.189591Z","signature_b64":"yTxv6Mpz9NfGAf/6DoPfUpF8aqCaGUjWxXKcNP3XPdT6jD+9kKpdU1hTI12l5l2x5UM91m5AeJPhsCEAjV8ADw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"d39469afa7a1062c6d29de476a308b039a1889fbc7e98fb165ed424b108ecf27","last_reissued_at":"2026-05-20T00:00:17.188776Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T00:00:17.188776Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2412.12636","source_version":3,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-20T00:00:17Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"lEMzUchCUp27Ah1RzrnmZgPEJ/+CdOy2UlhoHqt2J3FydFjVJUmntxx1IOyf7PAKCZzu1sZnPG5UuKbjahefAg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-28T17:58:58.992438Z"},"content_sha256":"e0651b705292721f80128007e8a402b6f34110367ac799d5cd73c091a546d750","schema_version":"1.0","event_id":"sha256:e0651b705292721f80128007e8a402b6f34110367ac799d5cd73c091a546d750"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2024:2OKGTL5HUEDCY3JJ3ZDWUMELAO","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"TrainMover: An Interruption-Resilient Runtime for ML Training","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI","cs.LG","cs.PF"],"primary_cat":"cs.DC","authors_text":"Aditya Akella, ChonLam Lao, Dennis Cai, Ennan Zhai, Jiamin Cao, Jiangfei Duan, Jiaqi Gao, Jingren Zhou, Minlan Yu, Pengcheng Zhang, Yichi Xu, Yong Li, Yu Guan, Zhengping Qian, Zhilong Zheng, Zhipeng Zhang","submitted_at":"2024-12-17T07:59:31Z","abstract_excerpt":"Large-scale ML training jobs are frequently interrupted by hardware and software anomalies, failures, and management events. Existing solutions like checkpoint-restart or runtime reconfiguration suffer from long downtimes and degraded performance. We present TrainMover, a resilient LLM training runtime that leverages elastic and standby machines to handle interruptions with minimal downtime and zero memory overhead. To achieve these goals, TrainMover introduces three key techniques: two-phase, delta-based communication group setup; communication-free sandboxed warmup; and general standby desig"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2412.12636","kind":"arxiv","version":3},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2412.12636/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-20T00:00:17Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"ifQPdAGV3u1RZX/SNZd8y2KPZsrsDEq7hayLUGppcN3KfLMuUWmIDxz+1mb1hbYuss8Ue6yaCn1E0hloqFDnBQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-28T17:58:58.993132Z"},"content_sha256":"04b31cfc7f2919f5c71a40bce90415604cc135bbaec0039237f8d040df769b51","schema_version":"1.0","event_id":"sha256:04b31cfc7f2919f5c71a40bce90415604cc135bbaec0039237f8d040df769b51"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/2OKGTL5HUEDCY3JJ3ZDWUMELAO/bundle.json","state_url":"https://pith.science/pith/2OKGTL5HUEDCY3JJ3ZDWUMELAO/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/2OKGTL5HUEDCY3JJ3ZDWUMELAO/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-28T17:58:58Z","links":{"resolver":"https://pith.science/pith/2OKGTL5HUEDCY3JJ3ZDWUMELAO","bundle":"https://pith.science/pith/2OKGTL5HUEDCY3JJ3ZDWUMELAO/bundle.json","state":"https://pith.science/pith/2OKGTL5HUEDCY3JJ3ZDWUMELAO/state.json","well_known_bundle":"https://pith.science/.well-known/pith/2OKGTL5HUEDCY3JJ3ZDWUMELAO/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2024:2OKGTL5HUEDCY3JJ3ZDWUMELAO","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"615d36d9a92f76674430e9858df476f1fa7a165ed2ef76a6ffc8f4696e6206f7","cross_cats_sorted":["cs.AI","cs.LG","cs.PF"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.DC","submitted_at":"2024-12-17T07:59:31Z","title_canon_sha256":"5a23546ddcfbe8f6d331576cbf033311683857ab6c6b0c96027edad8ac298b3c"},"schema_version":"1.0","source":{"id":"2412.12636","kind":"arxiv","version":3}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2412.12636","created_at":"2026-05-20T00:00:17Z"},{"alias_kind":"arxiv_version","alias_value":"2412.12636v3","created_at":"2026-05-20T00:00:17Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2412.12636","created_at":"2026-05-20T00:00:17Z"},{"alias_kind":"pith_short_12","alias_value":"2OKGTL5HUEDC","created_at":"2026-05-20T00:00:17Z"},{"alias_kind":"pith_short_16","alias_value":"2OKGTL5HUEDCY3JJ","created_at":"2026-05-20T00:00:17Z"},{"alias_kind":"pith_short_8","alias_value":"2OKGTL5H","created_at":"2026-05-20T00:00:17Z"}],"graph_snapshots":[{"event_id":"sha256:04b31cfc7f2919f5c71a40bce90415604cc135bbaec0039237f8d040df769b51","target":"graph","created_at":"2026-05-20T00:00:17Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2412.12636/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Large-scale ML training jobs are frequently interrupted by hardware and software anomalies, failures, and management events. Existing solutions like checkpoint-restart or runtime reconfiguration suffer from long downtimes and degraded performance. We present TrainMover, a resilient LLM training runtime that leverages elastic and standby machines to handle interruptions with minimal downtime and zero memory overhead. To achieve these goals, TrainMover introduces three key techniques: two-phase, delta-based communication group setup; communication-free sandboxed warmup; and general standby desig","authors_text":"Aditya Akella, ChonLam Lao, Dennis Cai, Ennan Zhai, Jiamin Cao, Jiangfei Duan, Jiaqi Gao, Jingren Zhou, Minlan Yu, Pengcheng Zhang, Yichi Xu, Yong Li, Yu Guan, Zhengping Qian, Zhilong Zheng, Zhipeng Zhang","cross_cats":["cs.AI","cs.LG","cs.PF"],"headline":"","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.DC","submitted_at":"2024-12-17T07:59:31Z","title":"TrainMover: An Interruption-Resilient Runtime for ML Training"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2412.12636","kind":"arxiv","version":3},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:e0651b705292721f80128007e8a402b6f34110367ac799d5cd73c091a546d750","target":"record","created_at":"2026-05-20T00:00:17Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"615d36d9a92f76674430e9858df476f1fa7a165ed2ef76a6ffc8f4696e6206f7","cross_cats_sorted":["cs.AI","cs.LG","cs.PF"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.DC","submitted_at":"2024-12-17T07:59:31Z","title_canon_sha256":"5a23546ddcfbe8f6d331576cbf033311683857ab6c6b0c96027edad8ac298b3c"},"schema_version":"1.0","source":{"id":"2412.12636","kind":"arxiv","version":3}},"canonical_sha256":"d39469afa7a1062c6d29de476a308b039a1889fbc7e98fb165ed424b108ecf27","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"d39469afa7a1062c6d29de476a308b039a1889fbc7e98fb165ed424b108ecf27","first_computed_at":"2026-05-20T00:00:17.188776Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-20T00:00:17.188776Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"yTxv6Mpz9NfGAf/6DoPfUpF8aqCaGUjWxXKcNP3XPdT6jD+9kKpdU1hTI12l5l2x5UM91m5AeJPhsCEAjV8ADw==","signature_status":"signed_v1","signed_at":"2026-05-20T00:00:17.189591Z","signed_message":"canonical_sha256_bytes"},"source_id":"2412.12636","source_kind":"arxiv","source_version":3}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:e0651b705292721f80128007e8a402b6f34110367ac799d5cd73c091a546d750","sha256:04b31cfc7f2919f5c71a40bce90415604cc135bbaec0039237f8d040df769b51"],"state_sha256":"2f042c61a0d840e91a8e13f0e1c0c7affacc95cf4641f1728b658880445b6f52"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"EzP37XbZVJAx3OjfPJa4aRWjJwDN8Ge8R5XNYN8WNRyfZIO9MNSybf3xu0nO6h78sGDUXwdBZ50rsgf1FoQHBg==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-28T17:58:58.996497Z","bundle_sha256":"986bf219b947b1b6eb598d172e6ca05998782d3b8f7792a0e163fa23b84bba90"}}