{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2025:FCIVDTSE6VC5MSH75IS7VYCXE2","short_pith_number":"pith:FCIVDTSE","canonical_record":{"source":{"id":"2511.09861","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.DC","submitted_at":"2025-11-13T01:41:47Z","cross_cats_sorted":["cs.AR"],"title_canon_sha256":"3f2f23862e808398126cf576b91c3a515d18a5f7fc16dbc02343cb89c5f5ba21","abstract_canon_sha256":"8853a9e3d2ecf7d02d6fcd4690f20a373655e8865728ec3109d1dcaa20dc187c"},"schema_version":"1.0"},"canonical_sha256":"289151ce44f545d648ffea25fae057268f810a911bb84d17410b4f29641fd649","source":{"kind":"arxiv","id":"2511.09861","version":3},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2511.09861","created_at":"2026-05-18T03:09:33Z"},{"alias_kind":"arxiv_version","alias_value":"2511.09861v3","created_at":"2026-05-18T03:09:33Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2511.09861","created_at":"2026-05-18T03:09:33Z"},{"alias_kind":"pith_short_12","alias_value":"FCIVDTSE6VC5","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"FCIVDTSE6VC5MSH7","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"FCIVDTSE","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2025:FCIVDTSE6VC5MSH75IS7VYCXE2","target":"record","payload":{"canonical_record":{"source":{"id":"2511.09861","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.DC","submitted_at":"2025-11-13T01:41:47Z","cross_cats_sorted":["cs.AR"],"title_canon_sha256":"3f2f23862e808398126cf576b91c3a515d18a5f7fc16dbc02343cb89c5f5ba21","abstract_canon_sha256":"8853a9e3d2ecf7d02d6fcd4690f20a373655e8865728ec3109d1dcaa20dc187c"},"schema_version":"1.0"},"canonical_sha256":"289151ce44f545d648ffea25fae057268f810a911bb84d17410b4f29641fd649","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T03:09:33.292749Z","signature_b64":"1CYG5eSk8uAlxX4i8h4EeHnilW2z5k+4mlImnodpWyKkMecWGDlxYsslaw98yyLTCiizHpicaqdvRU4fxDksDQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"289151ce44f545d648ffea25fae057268f810a911bb84d17410b4f29641fd649","last_reissued_at":"2026-05-18T03:09:33.292027Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T03:09:33.292027Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2511.09861","source_version":3,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T03:09:33Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"W48bh+/WUL+qqajMQhFDKKpyFk9iecHKuJMAIj5ksFU+3CB6vzt0JWmNg9ICwJERNU/uLknh/SRpKPnLe3IrDg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-18T20:00:33.232484Z"},"content_sha256":"2b8ca8d8d352eec19e912469e1736a099abd6becd22a93c2ee19e4a4f6acebf1","schema_version":"1.0","event_id":"sha256:2b8ca8d8d352eec19e912469e1736a099abd6becd22a93c2ee19e4a4f6acebf1"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2025:FCIVDTSE6VC5MSH75IS7VYCXE2","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Lit Silicon: A Case Where Thermal Imbalance Couples Concurrent Execution in Multiple GPUs","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"Thermal imbalance across GPUs introduces stragglers that slow down the system when using concurrent computation and communication.","cross_cats":["cs.AR"],"primary_cat":"cs.DC","authors_text":"Di Wu, Marco Kurzynski, Shaizeen Aga","submitted_at":"2025-11-13T01:41:47Z","abstract_excerpt":"GPU systems are increasingly powering modern datacenters at scale. Despite being highly performant, GPU systems can exhibit performance variation at the node and cluster levels. Such performance variation can significantly impact both high-performance computing and artificial intelligence workloads, such as cutting-edge large language models (LLMs). In this work, we analyze the performance of a single-node multi-GPU system running LLM training, and observe that the kernel-level performance variation is highly correlated with concurrent computation and communication (C3), a technique to overlap"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Thermally induced straggling coupled with concurrent computation and communication (C3) impacts performance variation, which we coin the Lit Silicon effect. More specifically, Lit Silicon describes that in a multi-GPU node, thermal imbalance across GPUs can introduce node-level straggler GPUs (hotter and slower), which in turn slow down the leader GPUs (cooler and faster).","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the observed kernel-level performance variation is primarily caused by thermal imbalance interacting with C3 rather than other factors such as workload imbalance, interconnect variability, or unmeasured hardware differences.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Thermal imbalance in multi-GPU nodes creates hotter straggler GPUs that slow down cooler leader GPUs during overlapped computation and communication in LLM training.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Thermal imbalance across GPUs introduces stragglers that slow down the system when using concurrent computation and communication.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"b7b793acefede3f1eb32eac7896e1ef1e044eb8675dcec5da43ce80f3848cf5c"},"source":{"id":"2511.09861","kind":"arxiv","version":3},"verdict":{"id":"0e7b87ea-94cc-402e-bb2d-13fb75687701","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-17T23:07:45.952214Z","strongest_claim":"Thermally induced straggling coupled with concurrent computation and communication (C3) impacts performance variation, which we coin the Lit Silicon effect. More specifically, Lit Silicon describes that in a multi-GPU node, thermal imbalance across GPUs can introduce node-level straggler GPUs (hotter and slower), which in turn slow down the leader GPUs (cooler and faster).","one_line_summary":"Thermal imbalance in multi-GPU nodes creates hotter straggler GPUs that slow down cooler leader GPUs during overlapped computation and communication in LLM training.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the observed kernel-level performance variation is primarily caused by thermal imbalance interacting with C3 rather than other factors such as workload imbalance, interconnect variability, or unmeasured hardware differences.","pith_extraction_headline":"Thermal imbalance across GPUs introduces stragglers that slow down the system when using concurrent computation and communication."},"references":{"count":56,"sample":[{"doi":"","year":1994,"title":"A High-Performance Matrix-Multiplication Algorithm on a Distributed-Memory Parallel Computer, Using Overlapped Communication ,","work_id":"675777de-42ed-44d4-8787-0148cb4382eb","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2025,"title":"ConCCL: Optimizing ML Concurrent Computation and Communication with GPU DMA Engines,","work_id":"05ef745d-b172-4c89-a925-bb98fdb0ade1","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2010,"title":"Accelerating SQL database operations on a GPU with CUDA,","work_id":"3fda7026-9a89-441c-a17a-fedf79f7fd1c","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2005,"title":"Language Models are Few-Shot Learners","work_id":"214732c0-2edd-44a0-af9e-28184a2b8279","ref_index":4,"cited_arxiv_id":"2005.14165","is_internal_anchor":true},{"doi":"","year":2023,"title":"GPU Database Systems Characterization and Optimization,","work_id":"5ba2286b-8afb-484a-91ec-7868b513c2e5","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":56,"snapshot_sha256":"4bf681e172d389e0d59f10e0644c25f30c8a123be76337b941e2377f9e4f7f8b","internal_anchors":7},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"0e7b87ea-94cc-402e-bb2d-13fb75687701"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T03:09:33Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"kIjUjBhHSGeADRQlnxJ1ugb89q1QN2ZDYBYXpYDars+d3BTlgLWkV2JjTaEydhXD7xjGJ5rTVO1nsyVQfxH9Ag==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-18T20:00:33.233456Z"},"content_sha256":"bf26df15ff2ec833559ec9c01733241bdde446ba90b3bcb8998826893743cbd6","schema_version":"1.0","event_id":"sha256:bf26df15ff2ec833559ec9c01733241bdde446ba90b3bcb8998826893743cbd6"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/FCIVDTSE6VC5MSH75IS7VYCXE2/bundle.json","state_url":"https://pith.science/pith/FCIVDTSE6VC5MSH75IS7VYCXE2/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/FCIVDTSE6VC5MSH75IS7VYCXE2/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-18T20:00:33Z","links":{"resolver":"https://pith.science/pith/FCIVDTSE6VC5MSH75IS7VYCXE2","bundle":"https://pith.science/pith/FCIVDTSE6VC5MSH75IS7VYCXE2/bundle.json","state":"https://pith.science/pith/FCIVDTSE6VC5MSH75IS7VYCXE2/state.json","well_known_bundle":"https://pith.science/.well-known/pith/FCIVDTSE6VC5MSH75IS7VYCXE2/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2025:FCIVDTSE6VC5MSH75IS7VYCXE2","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"8853a9e3d2ecf7d02d6fcd4690f20a373655e8865728ec3109d1dcaa20dc187c","cross_cats_sorted":["cs.AR"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.DC","submitted_at":"2025-11-13T01:41:47Z","title_canon_sha256":"3f2f23862e808398126cf576b91c3a515d18a5f7fc16dbc02343cb89c5f5ba21"},"schema_version":"1.0","source":{"id":"2511.09861","kind":"arxiv","version":3}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2511.09861","created_at":"2026-05-18T03:09:33Z"},{"alias_kind":"arxiv_version","alias_value":"2511.09861v3","created_at":"2026-05-18T03:09:33Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2511.09861","created_at":"2026-05-18T03:09:33Z"},{"alias_kind":"pith_short_12","alias_value":"FCIVDTSE6VC5","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"FCIVDTSE6VC5MSH7","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"FCIVDTSE","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:bf26df15ff2ec833559ec9c01733241bdde446ba90b3bcb8998826893743cbd6","target":"graph","created_at":"2026-05-18T03:09:33Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Thermally induced straggling coupled with concurrent computation and communication (C3) impacts performance variation, which we coin the Lit Silicon effect. More specifically, Lit Silicon describes that in a multi-GPU node, thermal imbalance across GPUs can introduce node-level straggler GPUs (hotter and slower), which in turn slow down the leader GPUs (cooler and faster)."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the observed kernel-level performance variation is primarily caused by thermal imbalance interacting with C3 rather than other factors such as workload imbalance, interconnect variability, or unmeasured hardware differences."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Thermal imbalance in multi-GPU nodes creates hotter straggler GPUs that slow down cooler leader GPUs during overlapped computation and communication in LLM training."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Thermal imbalance across GPUs introduces stragglers that slow down the system when using concurrent computation and communication."}],"snapshot_sha256":"b7b793acefede3f1eb32eac7896e1ef1e044eb8675dcec5da43ce80f3848cf5c"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"GPU systems are increasingly powering modern datacenters at scale. Despite being highly performant, GPU systems can exhibit performance variation at the node and cluster levels. Such performance variation can significantly impact both high-performance computing and artificial intelligence workloads, such as cutting-edge large language models (LLMs). In this work, we analyze the performance of a single-node multi-GPU system running LLM training, and observe that the kernel-level performance variation is highly correlated with concurrent computation and communication (C3), a technique to overlap","authors_text":"Di Wu, Marco Kurzynski, Shaizeen Aga","cross_cats":["cs.AR"],"headline":"Thermal imbalance across GPUs introduces stragglers that slow down the system when using concurrent computation and communication.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.DC","submitted_at":"2025-11-13T01:41:47Z","title":"Lit Silicon: A Case Where Thermal Imbalance Couples Concurrent Execution in Multiple GPUs"},"references":{"count":56,"internal_anchors":7,"resolved_work":56,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"A High-Performance Matrix-Multiplication Algorithm on a Distributed-Memory Parallel Computer, Using Overlapped Communication ,","work_id":"675777de-42ed-44d4-8787-0148cb4382eb","year":1994},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"ConCCL: Optimizing ML Concurrent Computation and Communication with GPU DMA Engines,","work_id":"05ef745d-b172-4c89-a925-bb98fdb0ade1","year":2025},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Accelerating SQL database operations on a GPU with CUDA,","work_id":"3fda7026-9a89-441c-a17a-fedf79f7fd1c","year":2010},{"cited_arxiv_id":"2005.14165","doi":"","is_internal_anchor":true,"ref_index":4,"title":"Language Models are Few-Shot Learners","work_id":"214732c0-2edd-44a0-af9e-28184a2b8279","year":2005},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"GPU Database Systems Characterization and Optimization,","work_id":"5ba2286b-8afb-484a-91ec-7868b513c2e5","year":2023}],"snapshot_sha256":"4bf681e172d389e0d59f10e0644c25f30c8a123be76337b941e2377f9e4f7f8b"},"source":{"id":"2511.09861","kind":"arxiv","version":3},"verdict":{"created_at":"2026-05-17T23:07:45.952214Z","id":"0e7b87ea-94cc-402e-bb2d-13fb75687701","model_set":{"reader":"grok-4.3"},"one_line_summary":"Thermal imbalance in multi-GPU nodes creates hotter straggler GPUs that slow down cooler leader GPUs during overlapped computation and communication in LLM training.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Thermal imbalance across GPUs introduces stragglers that slow down the system when using concurrent computation and communication.","strongest_claim":"Thermally induced straggling coupled with concurrent computation and communication (C3) impacts performance variation, which we coin the Lit Silicon effect. More specifically, Lit Silicon describes that in a multi-GPU node, thermal imbalance across GPUs can introduce node-level straggler GPUs (hotter and slower), which in turn slow down the leader GPUs (cooler and faster).","weakest_assumption":"That the observed kernel-level performance variation is primarily caused by thermal imbalance interacting with C3 rather than other factors such as workload imbalance, interconnect variability, or unmeasured hardware differences."}},"verdict_id":"0e7b87ea-94cc-402e-bb2d-13fb75687701"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:2b8ca8d8d352eec19e912469e1736a099abd6becd22a93c2ee19e4a4f6acebf1","target":"record","created_at":"2026-05-18T03:09:33Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"8853a9e3d2ecf7d02d6fcd4690f20a373655e8865728ec3109d1dcaa20dc187c","cross_cats_sorted":["cs.AR"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.DC","submitted_at":"2025-11-13T01:41:47Z","title_canon_sha256":"3f2f23862e808398126cf576b91c3a515d18a5f7fc16dbc02343cb89c5f5ba21"},"schema_version":"1.0","source":{"id":"2511.09861","kind":"arxiv","version":3}},"canonical_sha256":"289151ce44f545d648ffea25fae057268f810a911bb84d17410b4f29641fd649","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"289151ce44f545d648ffea25fae057268f810a911bb84d17410b4f29641fd649","first_computed_at":"2026-05-18T03:09:33.292027Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-18T03:09:33.292027Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"1CYG5eSk8uAlxX4i8h4EeHnilW2z5k+4mlImnodpWyKkMecWGDlxYsslaw98yyLTCiizHpicaqdvRU4fxDksDQ==","signature_status":"signed_v1","signed_at":"2026-05-18T03:09:33.292749Z","signed_message":"canonical_sha256_bytes"},"source_id":"2511.09861","source_kind":"arxiv","source_version":3}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:2b8ca8d8d352eec19e912469e1736a099abd6becd22a93c2ee19e4a4f6acebf1","sha256:bf26df15ff2ec833559ec9c01733241bdde446ba90b3bcb8998826893743cbd6"],"state_sha256":"de74639f7c7133fb73675cf44e50978fb03a3fe60ee206b48167c7769f9a8c06"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"ShnQEx3RE9j7Dgv7E5SIvDb1VA2wE5QuvUFIDEkH1XLzlVYx1jcNXuHEfXJAaRphxYS0D8CLiEopEGVKQ8X/BQ==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-18T20:00:33.236241Z","bundle_sha256":"15b95f7cd22f0bfe22571a98bc79faac168dcede7bd54545ba5a657d2885e5dc"}}