{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2024:7YXM6JRYOH4O5NBMCHKEPDPSW2","short_pith_number":"pith:7YXM6JRY","canonical_record":{"source":{"id":"2404.14294","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-04-22T15:53:08Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"0158e010d7858a65e7781dd03ec62b813bbae982fd020a8150281fd273403c03","abstract_canon_sha256":"7e45755716429abd0dc0e09cd3eff786a25f8857d55ccb1a7e23f2fa7d08b786"},"schema_version":"1.0"},"canonical_sha256":"fe2ecf263871f8eeb42c11d4478df2b69b77748d33f8d92acab2b44d81666059","source":{"kind":"arxiv","id":"2404.14294","version":3},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2404.14294","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"arxiv_version","alias_value":"2404.14294v3","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2404.14294","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"pith_short_12","alias_value":"7YXM6JRYOH4O","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"7YXM6JRYOH4O5NBM","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"7YXM6JRY","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2024:7YXM6JRYOH4O5NBMCHKEPDPSW2","target":"record","payload":{"canonical_record":{"source":{"id":"2404.14294","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-04-22T15:53:08Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"0158e010d7858a65e7781dd03ec62b813bbae982fd020a8150281fd273403c03","abstract_canon_sha256":"7e45755716429abd0dc0e09cd3eff786a25f8857d55ccb1a7e23f2fa7d08b786"},"schema_version":"1.0"},"canonical_sha256":"fe2ecf263871f8eeb42c11d4478df2b69b77748d33f8d92acab2b44d81666059","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:53.799025Z","signature_b64":"CyNTe/J2ssLwivPolec0iaoEw0/jBibZH7YlVEsxJzBzTkoqsiGhYAO19D3Zt4q82iU38Dy45hiqntBKvSSCAQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"fe2ecf263871f8eeb42c11d4478df2b69b77748d33f8d92acab2b44d81666059","last_reissued_at":"2026-05-17T23:38:53.798407Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:53.798407Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2404.14294","source_version":3,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:53Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"gBFix1fZda2UWUQFeeCkMk2ARYDlEsbeBJ5aNdkEIyhuYzHkTpxVOiPeuOgw6ZY32Hu+h8LYk6eqYCQPaUMgDw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-26T11:20:51.575421Z"},"content_sha256":"c29852246076b7535a4ac89e32471e3f00765e0c112c6220252668ae337a72e0","schema_version":"1.0","event_id":"sha256:c29852246076b7535a4ac89e32471e3f00765e0c112c6220252668ae337a72e0"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2024:7YXM6JRYOH4O5NBMCHKEPDPSW2","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"A Survey on Efficient Inference for Large Language Models","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"A survey organizes methods for efficient large language model inference into data-level, model-level, and system-level categories and benchmarks representative techniques.","cross_cats":["cs.AI"],"primary_cat":"cs.CL","authors_text":"Guohao Dai, Jiaming Xu, Ke Hong, Luning Wang, Shengen Yan, Shiyao Li, Tianyu Fu, Xiao-Ping Zhang, Xiuhong Li, Xuefei Ning, Yuhan Dong, Yuming Lou, Yu Wang, Zhihang Yuan, Zixuan Zhou","submitted_at":"2024-04-22T15:53:08Z","abstract_excerpt":"Large Language Models (LLMs) have attracted extensive attention due to their remarkable performance across various tasks. However, the substantial computational and memory requirements of LLM inference pose challenges for deployment in resource-constrained scenarios. Efforts within the field have been directed towards developing techniques aimed at enhancing the efficiency of LLM inference. This paper presents a comprehensive survey of the existing literature on efficient LLM inference. We start by analyzing the primary causes of the inefficient LLM inference, i.e., the large model size, the q"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"This paper presents a comprehensive survey of the existing literature on efficient LLM inference... organized into data-level, model-level, and system-level optimization... with comparative experiments on representative methods.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the chosen representative methods and experimental comparisons fairly represent the broader literature and yield generalizable quantitative insights without significant selection bias.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"The paper surveys techniques to speed up and reduce the resource needs of LLM inference, organized by data-level, model-level, and system-level changes, with comparative experiments on representative methods.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"A survey organizes methods for efficient large language model inference into data-level, model-level, and system-level categories and benchmarks representative techniques.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"db7eec77f23bcee3f3c597a53f5ac8216a6eacdb8a60e15dfd01e928160a1905"},"source":{"id":"2404.14294","kind":"arxiv","version":3},"verdict":{"id":"1ab38c5d-7fdb-4953-a163-f60d1ae7e089","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T02:36:18.313400Z","strongest_claim":"This paper presents a comprehensive survey of the existing literature on efficient LLM inference... organized into data-level, model-level, and system-level optimization... with comparative experiments on representative methods.","one_line_summary":"The paper surveys techniques to speed up and reduce the resource needs of LLM inference, organized by data-level, model-level, and system-level changes, with comparative experiments on representative methods.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the chosen representative methods and experimental comparisons fairly represent the broader literature and yield generalizable quantitative insights without significant selection bias.","pith_extraction_headline":"A survey organizes methods for efficient large language model inference into data-level, model-level, and system-level categories and benchmarks representative techniques."},"references":{"count":298,"sample":[{"doi":"","year":2018,"title":"Improving language understanding by generative pre-training,","work_id":"a7a0f0e5-46ea-4c45-916e-10a354ef7a75","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2019,"title":"Language models are unsupervised multitask learners","work_id":"9fb276fb-e836-4b02-aa1b-f31321e69d94","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":1901,"title":"Language models are few-shot learners","work_id":"ba44e148-856c-498e-aded-be65cf943446","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2022,"title":"OPT: Open Pre-trained Transformer Language Models","work_id":"d7ff3b21-1fff-4cf4-952a-4714e3ef2307","ref_index":4,"cited_arxiv_id":"2205.01068","is_internal_anchor":true},{"doi":"","year":2023,"title":"Baichuan 2: Open large-scale language models","work_id":"9ba8f898-3900-4776-b82e-11e767a86ba9","ref_index":6,"cited_arxiv_id":"2309.10305","is_internal_anchor":false}],"resolved_work":298,"snapshot_sha256":"3371180055fbfce2246d8816adb0c736ac16d95c49f32ea8e91bc7b5961557a5","internal_anchors":41},"formal_canon":{"evidence_count":2,"snapshot_sha256":"c945af8a0d0aa36253f04d5fc6ccb3ba31d21c787614f8283ddfc3ef053a6a17"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"1ab38c5d-7fdb-4953-a163-f60d1ae7e089"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:53Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"geSpfxdgZ7DnwBtH4fMH1p4Ww/zqUbm45NSc78iI9vqVNXFhceDcNwzTzzW5DOjQEi31p37rljM1L2mBFeTsAQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-26T11:20:51.575999Z"},"content_sha256":"5b4f4b444bd6e0aa9489d3806e0bba725d2fb4c0f8dba248c413f6a5f90635e2","schema_version":"1.0","event_id":"sha256:5b4f4b444bd6e0aa9489d3806e0bba725d2fb4c0f8dba248c413f6a5f90635e2"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/7YXM6JRYOH4O5NBMCHKEPDPSW2/bundle.json","state_url":"https://pith.science/pith/7YXM6JRYOH4O5NBMCHKEPDPSW2/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/7YXM6JRYOH4O5NBMCHKEPDPSW2/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-26T11:20:51Z","links":{"resolver":"https://pith.science/pith/7YXM6JRYOH4O5NBMCHKEPDPSW2","bundle":"https://pith.science/pith/7YXM6JRYOH4O5NBMCHKEPDPSW2/bundle.json","state":"https://pith.science/pith/7YXM6JRYOH4O5NBMCHKEPDPSW2/state.json","well_known_bundle":"https://pith.science/.well-known/pith/7YXM6JRYOH4O5NBMCHKEPDPSW2/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2024:7YXM6JRYOH4O5NBMCHKEPDPSW2","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"7e45755716429abd0dc0e09cd3eff786a25f8857d55ccb1a7e23f2fa7d08b786","cross_cats_sorted":["cs.AI"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-04-22T15:53:08Z","title_canon_sha256":"0158e010d7858a65e7781dd03ec62b813bbae982fd020a8150281fd273403c03"},"schema_version":"1.0","source":{"id":"2404.14294","kind":"arxiv","version":3}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2404.14294","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"arxiv_version","alias_value":"2404.14294v3","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2404.14294","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"pith_short_12","alias_value":"7YXM6JRYOH4O","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"7YXM6JRYOH4O5NBM","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"7YXM6JRY","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:5b4f4b444bd6e0aa9489d3806e0bba725d2fb4c0f8dba248c413f6a5f90635e2","target":"graph","created_at":"2026-05-17T23:38:53Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"This paper presents a comprehensive survey of the existing literature on efficient LLM inference... organized into data-level, model-level, and system-level optimization... with comparative experiments on representative methods."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the chosen representative methods and experimental comparisons fairly represent the broader literature and yield generalizable quantitative insights without significant selection bias."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"The paper surveys techniques to speed up and reduce the resource needs of LLM inference, organized by data-level, model-level, and system-level changes, with comparative experiments on representative methods."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"A survey organizes methods for efficient large language model inference into data-level, model-level, and system-level categories and benchmarks representative techniques."}],"snapshot_sha256":"db7eec77f23bcee3f3c597a53f5ac8216a6eacdb8a60e15dfd01e928160a1905"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"c945af8a0d0aa36253f04d5fc6ccb3ba31d21c787614f8283ddfc3ef053a6a17"},"paper":{"abstract_excerpt":"Large Language Models (LLMs) have attracted extensive attention due to their remarkable performance across various tasks. However, the substantial computational and memory requirements of LLM inference pose challenges for deployment in resource-constrained scenarios. Efforts within the field have been directed towards developing techniques aimed at enhancing the efficiency of LLM inference. This paper presents a comprehensive survey of the existing literature on efficient LLM inference. We start by analyzing the primary causes of the inefficient LLM inference, i.e., the large model size, the q","authors_text":"Guohao Dai, Jiaming Xu, Ke Hong, Luning Wang, Shengen Yan, Shiyao Li, Tianyu Fu, Xiao-Ping Zhang, Xiuhong Li, Xuefei Ning, Yuhan Dong, Yuming Lou, Yu Wang, Zhihang Yuan, Zixuan Zhou","cross_cats":["cs.AI"],"headline":"A survey organizes methods for efficient large language model inference into data-level, model-level, and system-level categories and benchmarks representative techniques.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-04-22T15:53:08Z","title":"A Survey on Efficient Inference for Large Language Models"},"references":{"count":298,"internal_anchors":41,"resolved_work":298,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Improving language understanding by generative pre-training,","work_id":"a7a0f0e5-46ea-4c45-916e-10a354ef7a75","year":2018},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Language models are unsupervised multitask learners","work_id":"9fb276fb-e836-4b02-aa1b-f31321e69d94","year":2019},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Language models are few-shot learners","work_id":"ba44e148-856c-498e-aded-be65cf943446","year":1901},{"cited_arxiv_id":"2205.01068","doi":"","is_internal_anchor":true,"ref_index":4,"title":"OPT: Open Pre-trained Transformer Language Models","work_id":"d7ff3b21-1fff-4cf4-952a-4714e3ef2307","year":2022},{"cited_arxiv_id":"2309.10305","doi":"","is_internal_anchor":false,"ref_index":6,"title":"Baichuan 2: Open large-scale language models","work_id":"9ba8f898-3900-4776-b82e-11e767a86ba9","year":2023}],"snapshot_sha256":"3371180055fbfce2246d8816adb0c736ac16d95c49f32ea8e91bc7b5961557a5"},"source":{"id":"2404.14294","kind":"arxiv","version":3},"verdict":{"created_at":"2026-05-15T02:36:18.313400Z","id":"1ab38c5d-7fdb-4953-a163-f60d1ae7e089","model_set":{"reader":"grok-4.3"},"one_line_summary":"The paper surveys techniques to speed up and reduce the resource needs of LLM inference, organized by data-level, model-level, and system-level changes, with comparative experiments on representative methods.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"A survey organizes methods for efficient large language model inference into data-level, model-level, and system-level categories and benchmarks representative techniques.","strongest_claim":"This paper presents a comprehensive survey of the existing literature on efficient LLM inference... organized into data-level, model-level, and system-level optimization... with comparative experiments on representative methods.","weakest_assumption":"That the chosen representative methods and experimental comparisons fairly represent the broader literature and yield generalizable quantitative insights without significant selection bias."}},"verdict_id":"1ab38c5d-7fdb-4953-a163-f60d1ae7e089"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:c29852246076b7535a4ac89e32471e3f00765e0c112c6220252668ae337a72e0","target":"record","created_at":"2026-05-17T23:38:53Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"7e45755716429abd0dc0e09cd3eff786a25f8857d55ccb1a7e23f2fa7d08b786","cross_cats_sorted":["cs.AI"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-04-22T15:53:08Z","title_canon_sha256":"0158e010d7858a65e7781dd03ec62b813bbae982fd020a8150281fd273403c03"},"schema_version":"1.0","source":{"id":"2404.14294","kind":"arxiv","version":3}},"canonical_sha256":"fe2ecf263871f8eeb42c11d4478df2b69b77748d33f8d92acab2b44d81666059","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"fe2ecf263871f8eeb42c11d4478df2b69b77748d33f8d92acab2b44d81666059","first_computed_at":"2026-05-17T23:38:53.798407Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:53.798407Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"CyNTe/J2ssLwivPolec0iaoEw0/jBibZH7YlVEsxJzBzTkoqsiGhYAO19D3Zt4q82iU38Dy45hiqntBKvSSCAQ==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:53.799025Z","signed_message":"canonical_sha256_bytes"},"source_id":"2404.14294","source_kind":"arxiv","source_version":3}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:c29852246076b7535a4ac89e32471e3f00765e0c112c6220252668ae337a72e0","sha256:5b4f4b444bd6e0aa9489d3806e0bba725d2fb4c0f8dba248c413f6a5f90635e2"],"state_sha256":"33ab947bb59ab9c9c562fa588607b0fd3c525c7b50f6160413167f10b1a0456e"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"5Wib5goHEKJ99f8qqXbNeoff+DZNd8L9cZLV/leHX9FUafGAoJ+EJuFOTCWmKGqIG67bl6owDC78g1iC/+T4Dw==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-26T11:20:51.579540Z","bundle_sha256":"501c855be78265e11b4dc2d4910ded740b453e1a690fa0a6ad73bec3616ef598"}}