{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:G2HV7X5SI6GH6AJLMEM6PUEZV7","short_pith_number":"pith:G2HV7X5S","canonical_record":{"source":{"id":"2604.07472","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2026-04-08T18:11:09Z","cross_cats_sorted":["cs.NI"],"title_canon_sha256":"ce8c58841fc5f42296e173de0e406dc8e12c46ab2d8f60fbfb7f23cf04c3b5f7","abstract_canon_sha256":"07b14608b034a7551d224c6d8a56905cbeb5c96a611f5d0543c1910bdc87a78a"},"schema_version":"1.0"},"canonical_sha256":"368f5fdfb2478c7f012b6119e7d099afeaf48e1e452748920c2eefea4ecfb9e0","source":{"kind":"arxiv","id":"2604.07472","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2604.07472","created_at":"2026-06-08T01:04:04Z"},{"alias_kind":"arxiv_version","alias_value":"2604.07472v2","created_at":"2026-06-08T01:04:04Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2604.07472","created_at":"2026-06-08T01:04:04Z"},{"alias_kind":"pith_short_12","alias_value":"G2HV7X5SI6GH","created_at":"2026-06-08T01:04:04Z"},{"alias_kind":"pith_short_16","alias_value":"G2HV7X5SI6GH6AJL","created_at":"2026-06-08T01:04:04Z"},{"alias_kind":"pith_short_8","alias_value":"G2HV7X5S","created_at":"2026-06-08T01:04:04Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:G2HV7X5SI6GH6AJLMEM6PUEZV7","target":"record","payload":{"canonical_record":{"source":{"id":"2604.07472","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2026-04-08T18:11:09Z","cross_cats_sorted":["cs.NI"],"title_canon_sha256":"ce8c58841fc5f42296e173de0e406dc8e12c46ab2d8f60fbfb7f23cf04c3b5f7","abstract_canon_sha256":"07b14608b034a7551d224c6d8a56905cbeb5c96a611f5d0543c1910bdc87a78a"},"schema_version":"1.0"},"canonical_sha256":"368f5fdfb2478c7f012b6119e7d099afeaf48e1e452748920c2eefea4ecfb9e0","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-08T01:04:04.524419Z","signature_b64":"Rc6PTSd4QHejSBHEr0bY5137O7xc8G31Nsns7J7XWZnf6FwmiTVyi2yB0I0jt47MVt8v5X09hch57gQbL+HJCg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"368f5fdfb2478c7f012b6119e7d099afeaf48e1e452748920c2eefea4ecfb9e0","last_reissued_at":"2026-06-08T01:04:04.522975Z","signature_status":"signed_v1","first_computed_at":"2026-06-08T01:04:04.522975Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2604.07472","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-06-08T01:04:04Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"o85kepFggCXThf8NmEsP0HQDq1JwATrDOX6nUu3/Ks3RPdCbMAqDyQMrIMyVfsvluMa2gf8TS+Wvq3fpcFSsCQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-08T22:16:49.112763Z"},"content_sha256":"f43775f81c4ae388de0280d9ebf2189064601b2e841aa692752827dc3d27b1da","schema_version":"1.0","event_id":"sha256:f43775f81c4ae388de0280d9ebf2189064601b2e841aa692752827dc3d27b1da"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:G2HV7X5SI6GH6AJLMEM6PUEZV7","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Scalable Joint Resource Allocation for SLO-Constrained LLM Inference in Heterogeneous GPU Clouds","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"Two heuristics allocate mixed-scale LLMs on heterogeneous GPUs in under one second while meeting SLOs and approaching optimal cost.","cross_cats":["cs.NI"],"primary_cat":"cs.LG","authors_text":"Duong Tung Nguyen, Jiaming Cheng","submitted_at":"2026-04-08T18:11:09Z","abstract_excerpt":"Serving large language model (LLM) inference in cloud environments requires jointly optimizing model selection, GPU provisioning, parallelism configuration, and workload routing under latency, accuracy, memory, and budget constraints. While mixed-integer linear programming (MILP) can model this problem, its computational cost limits frequent re-optimization under demand variability. Existing heuristics often optimize individual components separately and may become infeasible when system-wide constraints are enforced.\n  This paper presents a scalable framework for SLO-constrained LLM inference."},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"On workloads calibrated with the Azure LLM Inference Trace (2025), both heuristics produce feasible solutions in under one second, with AGH closely approaching optimal cost while achieving over 260x speedup on large-scale instances. Under out-of-sample stress tests with up to 1.5x parameter inflation, AGH maintains controlled SLO violations and stable cost.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the three constraint-aware mechanisms (TP-aware feasibility selection, cost-per-effective-coverage ranking, and TP upgrade) can always produce feasible allocations under the tightly coupled memory, delay, error, and budget constraints without missing real-world edge cases.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Two constraint-aware greedy heuristics (GH and AGH) solve mixed-scale LLM allocation on heterogeneous GPUs under SLO constraints in under one second with over 260x speedup and near-optimal cost compared to exact MILP.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Two heuristics allocate mixed-scale LLMs on heterogeneous GPUs in under one second while meeting SLOs and approaching optimal cost.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"68e8ad60b8f8dbaa1bc15519cff27ef888ed3b6d1aac7688b21177d44009746c"},"source":{"id":"2604.07472","kind":"arxiv","version":2},"verdict":{"id":"dba110d6-48a9-4dd7-8e7a-f2f46fd7f534","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-10T17:57:20.385200Z","strongest_claim":"On workloads calibrated with the Azure LLM Inference Trace (2025), both heuristics produce feasible solutions in under one second, with AGH closely approaching optimal cost while achieving over 260x speedup on large-scale instances. Under out-of-sample stress tests with up to 1.5x parameter inflation, AGH maintains controlled SLO violations and stable cost.","one_line_summary":"Two constraint-aware greedy heuristics (GH and AGH) solve mixed-scale LLM allocation on heterogeneous GPUs under SLO constraints in under one second with over 260x speedup and near-optimal cost compared to exact MILP.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the three constraint-aware mechanisms (TP-aware feasibility selection, cost-per-effective-coverage ranking, and TP upgrade) can always produce feasible allocations under the tightly coupled memory, delay, error, and budget constraints without missing real-world edge cases.","pith_extraction_headline":"Two heuristics allocate mixed-scale LLMs on heterogeneous GPUs in under one second while meeting SLOs and approaching optimal cost."},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2604.07472/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"dba110d6-48a9-4dd7-8e7a-f2f46fd7f534"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-06-08T01:04:04Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"RpqB95z+J8n0lbLVa+E7G7SGqP0r2ZFs7HQI6eDBjUIObDzWdO49PhS2ESppRm81Hd7o0odQ0RCanxmh0omkAg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-08T22:16:49.113219Z"},"content_sha256":"660172d19ebd160c897795bb2b44be69a2a1e75af0ff5ddef3cb3a7044feb6b8","schema_version":"1.0","event_id":"sha256:660172d19ebd160c897795bb2b44be69a2a1e75af0ff5ddef3cb3a7044feb6b8"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/G2HV7X5SI6GH6AJLMEM6PUEZV7/bundle.json","state_url":"https://pith.science/pith/G2HV7X5SI6GH6AJLMEM6PUEZV7/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/G2HV7X5SI6GH6AJLMEM6PUEZV7/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-08T22:16:49Z","links":{"resolver":"https://pith.science/pith/G2HV7X5SI6GH6AJLMEM6PUEZV7","bundle":"https://pith.science/pith/G2HV7X5SI6GH6AJLMEM6PUEZV7/bundle.json","state":"https://pith.science/pith/G2HV7X5SI6GH6AJLMEM6PUEZV7/state.json","well_known_bundle":"https://pith.science/.well-known/pith/G2HV7X5SI6GH6AJLMEM6PUEZV7/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:G2HV7X5SI6GH6AJLMEM6PUEZV7","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"07b14608b034a7551d224c6d8a56905cbeb5c96a611f5d0543c1910bdc87a78a","cross_cats_sorted":["cs.NI"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2026-04-08T18:11:09Z","title_canon_sha256":"ce8c58841fc5f42296e173de0e406dc8e12c46ab2d8f60fbfb7f23cf04c3b5f7"},"schema_version":"1.0","source":{"id":"2604.07472","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2604.07472","created_at":"2026-06-08T01:04:04Z"},{"alias_kind":"arxiv_version","alias_value":"2604.07472v2","created_at":"2026-06-08T01:04:04Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2604.07472","created_at":"2026-06-08T01:04:04Z"},{"alias_kind":"pith_short_12","alias_value":"G2HV7X5SI6GH","created_at":"2026-06-08T01:04:04Z"},{"alias_kind":"pith_short_16","alias_value":"G2HV7X5SI6GH6AJL","created_at":"2026-06-08T01:04:04Z"},{"alias_kind":"pith_short_8","alias_value":"G2HV7X5S","created_at":"2026-06-08T01:04:04Z"}],"graph_snapshots":[{"event_id":"sha256:660172d19ebd160c897795bb2b44be69a2a1e75af0ff5ddef3cb3a7044feb6b8","target":"graph","created_at":"2026-06-08T01:04:04Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"On workloads calibrated with the Azure LLM Inference Trace (2025), both heuristics produce feasible solutions in under one second, with AGH closely approaching optimal cost while achieving over 260x speedup on large-scale instances. Under out-of-sample stress tests with up to 1.5x parameter inflation, AGH maintains controlled SLO violations and stable cost."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the three constraint-aware mechanisms (TP-aware feasibility selection, cost-per-effective-coverage ranking, and TP upgrade) can always produce feasible allocations under the tightly coupled memory, delay, error, and budget constraints without missing real-world edge cases."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Two constraint-aware greedy heuristics (GH and AGH) solve mixed-scale LLM allocation on heterogeneous GPUs under SLO constraints in under one second with over 260x speedup and near-optimal cost compared to exact MILP."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Two heuristics allocate mixed-scale LLMs on heterogeneous GPUs in under one second while meeting SLOs and approaching optimal cost."}],"snapshot_sha256":"68e8ad60b8f8dbaa1bc15519cff27ef888ed3b6d1aac7688b21177d44009746c"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2604.07472/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Serving large language model (LLM) inference in cloud environments requires jointly optimizing model selection, GPU provisioning, parallelism configuration, and workload routing under latency, accuracy, memory, and budget constraints. While mixed-integer linear programming (MILP) can model this problem, its computational cost limits frequent re-optimization under demand variability. Existing heuristics often optimize individual components separately and may become infeasible when system-wide constraints are enforced.\n  This paper presents a scalable framework for SLO-constrained LLM inference.","authors_text":"Duong Tung Nguyen, Jiaming Cheng","cross_cats":["cs.NI"],"headline":"Two heuristics allocate mixed-scale LLMs on heterogeneous GPUs in under one second while meeting SLOs and approaching optimal cost.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2026-04-08T18:11:09Z","title":"Scalable Joint Resource Allocation for SLO-Constrained LLM Inference in Heterogeneous GPU Clouds"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2604.07472","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-10T17:57:20.385200Z","id":"dba110d6-48a9-4dd7-8e7a-f2f46fd7f534","model_set":{"reader":"grok-4.3"},"one_line_summary":"Two constraint-aware greedy heuristics (GH and AGH) solve mixed-scale LLM allocation on heterogeneous GPUs under SLO constraints in under one second with over 260x speedup and near-optimal cost compared to exact MILP.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Two heuristics allocate mixed-scale LLMs on heterogeneous GPUs in under one second while meeting SLOs and approaching optimal cost.","strongest_claim":"On workloads calibrated with the Azure LLM Inference Trace (2025), both heuristics produce feasible solutions in under one second, with AGH closely approaching optimal cost while achieving over 260x speedup on large-scale instances. Under out-of-sample stress tests with up to 1.5x parameter inflation, AGH maintains controlled SLO violations and stable cost.","weakest_assumption":"That the three constraint-aware mechanisms (TP-aware feasibility selection, cost-per-effective-coverage ranking, and TP upgrade) can always produce feasible allocations under the tightly coupled memory, delay, error, and budget constraints without missing real-world edge cases."}},"verdict_id":"dba110d6-48a9-4dd7-8e7a-f2f46fd7f534"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:f43775f81c4ae388de0280d9ebf2189064601b2e841aa692752827dc3d27b1da","target":"record","created_at":"2026-06-08T01:04:04Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"07b14608b034a7551d224c6d8a56905cbeb5c96a611f5d0543c1910bdc87a78a","cross_cats_sorted":["cs.NI"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2026-04-08T18:11:09Z","title_canon_sha256":"ce8c58841fc5f42296e173de0e406dc8e12c46ab2d8f60fbfb7f23cf04c3b5f7"},"schema_version":"1.0","source":{"id":"2604.07472","kind":"arxiv","version":2}},"canonical_sha256":"368f5fdfb2478c7f012b6119e7d099afeaf48e1e452748920c2eefea4ecfb9e0","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"368f5fdfb2478c7f012b6119e7d099afeaf48e1e452748920c2eefea4ecfb9e0","first_computed_at":"2026-06-08T01:04:04.522975Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-06-08T01:04:04.522975Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"Rc6PTSd4QHejSBHEr0bY5137O7xc8G31Nsns7J7XWZnf6FwmiTVyi2yB0I0jt47MVt8v5X09hch57gQbL+HJCg==","signature_status":"signed_v1","signed_at":"2026-06-08T01:04:04.524419Z","signed_message":"canonical_sha256_bytes"},"source_id":"2604.07472","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:f43775f81c4ae388de0280d9ebf2189064601b2e841aa692752827dc3d27b1da","sha256:660172d19ebd160c897795bb2b44be69a2a1e75af0ff5ddef3cb3a7044feb6b8"],"state_sha256":"ac3a430d790bbfa2ca6c750c615c01cb34c41a2c7a7cdbc2f8be807447cf6ec6"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"WPZ5/XLXvuGkr8O4+m3gnc+wlS19aSIxSkfgPQJzdXpXUuiZKmJTc4xG5UmdIrMUKzXV4/D1bojO2DsKkKXEBA==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-08T22:16:49.115412Z","bundle_sha256":"dd33cf5b8d4969c5ec929ba832f5c4e16372cfffea509935ce97b30c620cb48d"}}