{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:MI552NCCD4HBZDJQTKS7H6AWNY","short_pith_number":"pith:MI552NCC","schema_version":"1.0","canonical_sha256":"623bdd34421f0e1c8d309aa5f3f8166e16ae884987bd2c9f7d8223bd2681ad63","source":{"kind":"arxiv","id":"2606.29708","version":1},"attestation_state":"computed","paper":{"title":"Demystifying the Design Space and Best Practices for Heterogeneous LLM Inference and Serving","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.DC","authors_text":"Dian Wang, Fangcheng Fu, He Liu, Hongzhou Zhang, Jinlong Hou, Jun Chen, Ping Zhang, Ruya Gu, Xiangbin Li, Xiangjun Huang, Xiaohe Hu, Xiaowei Shen, Yijie Chen, Yinhui Lu, Yuan Cheng, Zhengbo Wang, Zhixin Wang, Zhou Tan","submitted_at":"2026-06-29T02:24:13Z","abstract_excerpt":"Heterogeneous prefill-decode (PD) inference is now in production: prefill on cost-efficient or supply-available accelerators, decode on bandwidth-strong ones, and KV state crossing mixed interconnects in mixed numerical formats. Each deployment makes these decisions on its own. What is missing is the picture across configurations-which decisions must be made jointly at the PD boundary, and which can be made independently. We propose a design space organized along four design axes-accelerator, precision, interconnect, and KV residency and the workload regime (stage pressure) they respond to. We"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.29708","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.DC","submitted_at":"2026-06-29T02:24:13Z","cross_cats_sorted":[],"title_canon_sha256":"8888f7c3a9d46bfe6d5170c7fb5a11b6ef1a56900346cf05d19e1cf386bb040c","abstract_canon_sha256":"ebc5c64436a0d9d8b4b33435081ba5f9ea1db8768ba1b4834e0ec281fd7d5d6a"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-30T02:17:31.497947Z","signature_b64":"FN+I+Z+AtRIPzSAiKkRLnoDtUnclLkA0CINSP6Hi7VP3daMPLSyEJ1kI+lVWRxQCcFOkk3Y0cEtrJFB76Q5HAw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"623bdd34421f0e1c8d309aa5f3f8166e16ae884987bd2c9f7d8223bd2681ad63","last_reissued_at":"2026-06-30T02:17:31.497492Z","signature_status":"signed_v1","first_computed_at":"2026-06-30T02:17:31.497492Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Demystifying the Design Space and Best Practices for Heterogeneous LLM Inference and Serving","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.DC","authors_text":"Dian Wang, Fangcheng Fu, He Liu, Hongzhou Zhang, Jinlong Hou, Jun Chen, Ping Zhang, Ruya Gu, Xiangbin Li, Xiangjun Huang, Xiaohe Hu, Xiaowei Shen, Yijie Chen, Yinhui Lu, Yuan Cheng, Zhengbo Wang, Zhixin Wang, Zhou Tan","submitted_at":"2026-06-29T02:24:13Z","abstract_excerpt":"Heterogeneous prefill-decode (PD) inference is now in production: prefill on cost-efficient or supply-available accelerators, decode on bandwidth-strong ones, and KV state crossing mixed interconnects in mixed numerical formats. Each deployment makes these decisions on its own. What is missing is the picture across configurations-which decisions must be made jointly at the PD boundary, and which can be made independently. We propose a design space organized along four design axes-accelerator, precision, interconnect, and KV residency and the workload regime (stage pressure) they respond to. We"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.29708","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.29708/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.29708","created_at":"2026-06-30T02:17:31.497553+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.29708v1","created_at":"2026-06-30T02:17:31.497553+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.29708","created_at":"2026-06-30T02:17:31.497553+00:00"},{"alias_kind":"pith_short_12","alias_value":"MI552NCCD4HB","created_at":"2026-06-30T02:17:31.497553+00:00"},{"alias_kind":"pith_short_16","alias_value":"MI552NCCD4HBZDJQ","created_at":"2026-06-30T02:17:31.497553+00:00"},{"alias_kind":"pith_short_8","alias_value":"MI552NCC","created_at":"2026-06-30T02:17:31.497553+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/MI552NCCD4HBZDJQTKS7H6AWNY","json":"https://pith.science/pith/MI552NCCD4HBZDJQTKS7H6AWNY.json","graph_json":"https://pith.science/api/pith-number/MI552NCCD4HBZDJQTKS7H6AWNY/graph.json","events_json":"https://pith.science/api/pith-number/MI552NCCD4HBZDJQTKS7H6AWNY/events.json","paper":"https://pith.science/paper/MI552NCC"},"agent_actions":{"view_html":"https://pith.science/pith/MI552NCCD4HBZDJQTKS7H6AWNY","download_json":"https://pith.science/pith/MI552NCCD4HBZDJQTKS7H6AWNY.json","view_paper":"https://pith.science/paper/MI552NCC","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.29708&json=true","fetch_graph":"https://pith.science/api/pith-number/MI552NCCD4HBZDJQTKS7H6AWNY/graph.json","fetch_events":"https://pith.science/api/pith-number/MI552NCCD4HBZDJQTKS7H6AWNY/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/MI552NCCD4HBZDJQTKS7H6AWNY/action/timestamp_anchor","attest_storage":"https://pith.science/pith/MI552NCCD4HBZDJQTKS7H6AWNY/action/storage_attestation","attest_author":"https://pith.science/pith/MI552NCCD4HBZDJQTKS7H6AWNY/action/author_attestation","sign_citation":"https://pith.science/pith/MI552NCCD4HBZDJQTKS7H6AWNY/action/citation_signature","submit_replication":"https://pith.science/pith/MI552NCCD4HBZDJQTKS7H6AWNY/action/replication_record"}},"created_at":"2026-06-30T02:17:31.497553+00:00","updated_at":"2026-06-30T02:17:31.497553+00:00"}