{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2024:VCY6QYJUC5D74CIWMFTRABMMZ5","short_pith_number":"pith:VCY6QYJU","schema_version":"1.0","canonical_sha256":"a8b1e861341747fe0916616710058ccf4997b1623515699aa71e7179b61727a4","source":{"kind":"arxiv","id":"2410.18248","version":2},"attestation_state":"computed","paper":{"title":"Fast Inference for Augmented Large Language Models","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.LG","authors_text":"Cong Liang, Michael Mitzenmacher, Minlan Yu, Qianru Lao, Rana Shahout, Shiji Xin, Yong Cui","submitted_at":"2024-10-23T19:53:30Z","abstract_excerpt":"Augmented Large Language Models (LLMs) enhance the capabilities of standalone LLMs by integrating external data sources through API calls. In interactive LLM applications, efficient scheduling is crucial for maintaining low request completion times, directly impacting user engagement. However, these augmentations introduce scheduling challenges due to the need to manage limited memory for cached information (KV caches). As a result, traditional size-based scheduling algorithms, such as Shortest Job First (SJF), become less effective at minimizing completion times. Existing work focuses only on"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2410.18248","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2024-10-23T19:53:30Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"570050746df1725cae93c0f40340a51e5abe853293e97d4ad417169e5e38d6c3","abstract_canon_sha256":"cf2b5cabaeff265fe6e74cfa5aaa94417001afc6d92bfec69b2b997d7ae76052"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-07-05T09:26:31.922912Z","signature_b64":"WEfXayVOGHiSbkNXOP1fr9RrwOwI3o8r46RGFiVPOhaftwEW9A9IaDn6fl2dnzc5Ulm1kC7te0Nwkiht19DIAw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"a8b1e861341747fe0916616710058ccf4997b1623515699aa71e7179b61727a4","last_reissued_at":"2026-07-05T09:26:31.922425Z","signature_status":"signed_v1","first_computed_at":"2026-07-05T09:26:31.922425Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Fast Inference for Augmented Large Language Models","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.LG","authors_text":"Cong Liang, Michael Mitzenmacher, Minlan Yu, Qianru Lao, Rana Shahout, Shiji Xin, Yong Cui","submitted_at":"2024-10-23T19:53:30Z","abstract_excerpt":"Augmented Large Language Models (LLMs) enhance the capabilities of standalone LLMs by integrating external data sources through API calls. In interactive LLM applications, efficient scheduling is crucial for maintaining low request completion times, directly impacting user engagement. However, these augmentations introduce scheduling challenges due to the need to manage limited memory for cached information (KV caches). As a result, traditional size-based scheduling algorithms, such as Shortest Job First (SJF), become less effective at minimizing completion times. Existing work focuses only on"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2410.18248","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2410.18248/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2410.18248","created_at":"2026-07-05T09:26:31.922483+00:00"},{"alias_kind":"arxiv_version","alias_value":"2410.18248v2","created_at":"2026-07-05T09:26:31.922483+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2410.18248","created_at":"2026-07-05T09:26:31.922483+00:00"},{"alias_kind":"pith_short_12","alias_value":"VCY6QYJUC5D7","created_at":"2026-07-05T09:26:31.922483+00:00"},{"alias_kind":"pith_short_16","alias_value":"VCY6QYJUC5D74CIW","created_at":"2026-07-05T09:26:31.922483+00:00"},{"alias_kind":"pith_short_8","alias_value":"VCY6QYJU","created_at":"2026-07-05T09:26:31.922483+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":3,"internal_anchor_count":0,"sample":[{"citing_arxiv_id":"2605.02329","citing_title":"Taming Request Imbalance: SLO-Aware Scheduling for Disaggregated LLM Inference","ref_index":8,"is_internal_anchor":false},{"citing_arxiv_id":"2605.21427","citing_title":"PALS: Power-Aware LLM Serving for Mixture-of-Experts Models","ref_index":33,"is_internal_anchor":false},{"citing_arxiv_id":"2605.02329","citing_title":"Taming Request Imbalance: SLO-Aware Scheduling for Disaggregated LLM Inference","ref_index":8,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/VCY6QYJUC5D74CIWMFTRABMMZ5","json":"https://pith.science/pith/VCY6QYJUC5D74CIWMFTRABMMZ5.json","graph_json":"https://pith.science/api/pith-number/VCY6QYJUC5D74CIWMFTRABMMZ5/graph.json","events_json":"https://pith.science/api/pith-number/VCY6QYJUC5D74CIWMFTRABMMZ5/events.json","paper":"https://pith.science/paper/VCY6QYJU"},"agent_actions":{"view_html":"https://pith.science/pith/VCY6QYJUC5D74CIWMFTRABMMZ5","download_json":"https://pith.science/pith/VCY6QYJUC5D74CIWMFTRABMMZ5.json","view_paper":"https://pith.science/paper/VCY6QYJU","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2410.18248&json=true","fetch_graph":"https://pith.science/api/pith-number/VCY6QYJUC5D74CIWMFTRABMMZ5/graph.json","fetch_events":"https://pith.science/api/pith-number/VCY6QYJUC5D74CIWMFTRABMMZ5/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/VCY6QYJUC5D74CIWMFTRABMMZ5/action/timestamp_anchor","attest_storage":"https://pith.science/pith/VCY6QYJUC5D74CIWMFTRABMMZ5/action/storage_attestation","attest_author":"https://pith.science/pith/VCY6QYJUC5D74CIWMFTRABMMZ5/action/author_attestation","sign_citation":"https://pith.science/pith/VCY6QYJUC5D74CIWMFTRABMMZ5/action/citation_signature","submit_replication":"https://pith.science/pith/VCY6QYJUC5D74CIWMFTRABMMZ5/action/replication_record"}},"created_at":"2026-07-05T09:26:31.922483+00:00","updated_at":"2026-07-05T09:26:31.922483+00:00"}