{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2024:C75OY7FNMDYVQFC4XNHJNN525T","short_pith_number":"pith:C75OY7FN","schema_version":"1.0","canonical_sha256":"17faec7cad60f158145cbb4e96b7baecd22c2be3149c940e2e25447e55f12948","source":{"kind":"arxiv","id":"2401.15391","version":1},"attestation_state":"computed","paper":{"title":"MultiHop-RAG: Benchmarking Retrieval-Augmented Generation for Multi-Hop Queries","license":"http://creativecommons.org/licenses/by-sa/4.0/","headline":"Existing RAG systems are inadequate for answering multi-hop queries that require retrieving and reasoning over multiple pieces of evidence.","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Yixuan Tang, Yi Yang","submitted_at":"2024-01-27T11:41:48Z","abstract_excerpt":"Retrieval-augmented generation (RAG) augments large language models (LLM) by retrieving relevant knowledge, showing promising potential in mitigating LLM hallucinations and enhancing response quality, thereby facilitating the great adoption of LLMs in practice. However, we find that existing RAG systems are inadequate in answering multi-hop queries, which require retrieving and reasoning over multiple pieces of supporting evidence. Furthermore, to our knowledge, no existing RAG benchmarking dataset focuses on multi-hop queries. In this paper, we develop a novel dataset, MultiHop-RAG, which con"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":true,"formal_links_present":true},"canonical_record":{"source":{"id":"2401.15391","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by-sa/4.0/","primary_cat":"cs.CL","submitted_at":"2024-01-27T11:41:48Z","cross_cats_sorted":[],"title_canon_sha256":"c55454af3cb3673b746ca7a8f066c0e27a37f01b4e8c4c266b1be07fd08255de","abstract_canon_sha256":"8e3a288a6d0114362e20405f60960423603dacd29df453504f4c2fa815a03da6"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:52.399847Z","signature_b64":"M8ob6d0I5XTYJrXX/9Qbhso/XwbPDOpdNJLMHFxBu/uFfNzxXAr7GqHBEy82l3dLDuwFvRDjILYY+B8g7jrCBA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"17faec7cad60f158145cbb4e96b7baecd22c2be3149c940e2e25447e55f12948","last_reissued_at":"2026-05-17T23:38:52.399353Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:52.399353Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"MultiHop-RAG: Benchmarking Retrieval-Augmented Generation for Multi-Hop Queries","license":"http://creativecommons.org/licenses/by-sa/4.0/","headline":"Existing RAG systems are inadequate for answering multi-hop queries that require retrieving and reasoning over multiple pieces of evidence.","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Yixuan Tang, Yi Yang","submitted_at":"2024-01-27T11:41:48Z","abstract_excerpt":"Retrieval-augmented generation (RAG) augments large language models (LLM) by retrieving relevant knowledge, showing promising potential in mitigating LLM hallucinations and enhancing response quality, thereby facilitating the great adoption of LLMs in practice. However, we find that existing RAG systems are inadequate in answering multi-hop queries, which require retrieving and reasoning over multiple pieces of supporting evidence. Furthermore, to our knowledge, no existing RAG benchmarking dataset focuses on multi-hop queries. In this paper, we develop a novel dataset, MultiHop-RAG, which con"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"existing RAG systems are inadequate in answering multi-hop queries, which require retrieving and reasoning over multiple pieces of supporting evidence.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The multi-hop queries constructed from the English news article dataset accurately reflect the distribution and difficulty of real-world multi-hop queries that users would ask RAG systems.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"MultiHop-RAG is a new benchmark dataset demonstrating that existing retrieval-augmented generation systems perform poorly on multi-hop queries requiring retrieval and reasoning over multiple evidence pieces.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Existing RAG systems are inadequate for answering multi-hop queries that require retrieving and reasoning over multiple pieces of evidence.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"db2b79536c04f2a262096a0deee1b0800ed16725b4f3a3a135f3a34e1c6f871e"},"source":{"id":"2401.15391","kind":"arxiv","version":1},"verdict":{"id":"8be6379a-3ddc-4466-90b1-cba1db8d1fdd","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T13:49:12.581967Z","strongest_claim":"existing RAG systems are inadequate in answering multi-hop queries, which require retrieving and reasoning over multiple pieces of supporting evidence.","one_line_summary":"MultiHop-RAG is a new benchmark dataset demonstrating that existing retrieval-augmented generation systems perform poorly on multi-hop queries requiring retrieval and reasoning over multiple evidence pieces.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The multi-hop queries constructed from the English news article dataset accurately reflect the distribution and difficulty of real-world multi-hop queries that users would ask RAG systems.","pith_extraction_headline":"Existing RAG systems are inadequate for answering multi-hop queries that require retrieving and reasoning over multiple pieces of evidence."},"references":{"count":296,"sample":[{"doi":"","year":2023,"title":"Anthropic. 2023. Claude 2.1 ( May version). https://api.anthropic.com/v1/messages. Claude 2.1","work_id":"e23e2e37-8d00-42b9-b780-2d3e46fd5287","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"Akari Asai, Sewon Min, Zexuan Zhong, and Danqi Chen. 2023. Retrieval-based language models and applications. In Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics ","work_id":"962bc87e-4a60-497f-9674-e165975c5280","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2022,"title":"Sebastian Borgeaud, Arthur Mensch, Jordan Hoffmann, Trevor Cai, Eliza Rutherford, Katie Millican, George Bm Van Den Driessche, Jean-Baptiste Lespiau, Bogdan Damoc, Aidan Clark, Diego De Las Casas, Aur","work_id":"a5d01995-b562-4bda-b925-346dea795b67","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2022,"title":"Harrison Chase. 2022. https://github.com/langchain-ai/langchain LangChain","work_id":"f2314057-ba9b-49f0-8722-89abbbb435c0","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"Benchmarking large language models in retrieval-augmented generation.arXiv preprint arXiv:2309.01431","work_id":"0cedce0f-ffd3-4107-b96a-00f53c8c3d37","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":296,"snapshot_sha256":"78779bff3b8eb3b0f49a10b84cd196b28d7ae39819ed6b3697bd62459db339e7","internal_anchors":6},"formal_canon":{"evidence_count":1,"snapshot_sha256":"b263552db690db56a6221e9260a68f3b5cf118c3e9be41621d8051a8dbac35df"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2401.15391","created_at":"2026-05-17T23:38:52.399434+00:00"},{"alias_kind":"arxiv_version","alias_value":"2401.15391v1","created_at":"2026-05-17T23:38:52.399434+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2401.15391","created_at":"2026-05-17T23:38:52.399434+00:00"},{"alias_kind":"pith_short_12","alias_value":"C75OY7FNMDYV","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_16","alias_value":"C75OY7FNMDYVQFC4","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_8","alias_value":"C75OY7FN","created_at":"2026-05-18T12:33:37.589309+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":25,"internal_anchor_count":25,"sample":[{"citing_arxiv_id":"2410.05970","citing_title":"PDF-WuKong: A Large Multimodal Model for Efficient Long PDF Reading with End-to-End Sparse Sampling","ref_index":8,"is_internal_anchor":true},{"citing_arxiv_id":"2502.09891","citing_title":"ArchRAG: Attributed Community-based Hierarchical Retrieval-Augmented Generation","ref_index":54,"is_internal_anchor":true},{"citing_arxiv_id":"2503.04338","citing_title":"In-depth Analysis of Graph-based RAG in a Unified Framework","ref_index":80,"is_internal_anchor":true},{"citing_arxiv_id":"2603.14828","citing_title":"Toward Robust GraphRAG: Mitigating Retrieval Drift and Hallucination from Imperfect Knowledge Graphs","ref_index":12,"is_internal_anchor":true},{"citing_arxiv_id":"2605.15156","citing_title":"MeMo: Memory as a Model","ref_index":20,"is_internal_anchor":true},{"citing_arxiv_id":"2604.16310","citing_title":"RAG-DIVE: A Dynamic Approach for Multi-Turn Dialogue Evaluation in Retrieval-Augmented Generation","ref_index":33,"is_internal_anchor":true},{"citing_arxiv_id":"2402.19473","citing_title":"Retrieval-Augmented Generation for AI-Generated Content: A Survey","ref_index":197,"is_internal_anchor":true},{"citing_arxiv_id":"2605.14192","citing_title":"Why Retrieval-Augmented Generation Fails: A Graph Perspective","ref_index":39,"is_internal_anchor":true},{"citing_arxiv_id":"2605.15156","citing_title":"MeMo: Memory as a Model","ref_index":20,"is_internal_anchor":true},{"citing_arxiv_id":"2604.14170","citing_title":"Stateful Evidence-Driven Retrieval-Augmented Generation with Iterative Reasoning","ref_index":14,"is_internal_anchor":true},{"citing_arxiv_id":"2604.02640","citing_title":"Overcoming the \"Impracticality\" of RAG: Proposing a Real-World Benchmark and Multi-Dimensional Diagnostic Framework","ref_index":12,"is_internal_anchor":true},{"citing_arxiv_id":"2605.08581","citing_title":"PRISM: Fast Online LLM Serving via Scheduling-Memory Co-design","ref_index":11,"is_internal_anchor":true},{"citing_arxiv_id":"2605.10168","citing_title":"ASTRA-QA: A Benchmark for Abstract Question Answering over Documents","ref_index":40,"is_internal_anchor":true},{"citing_arxiv_id":"2604.25906","citing_title":"Make Any Collection Navigable: Methods for Constructing and Evaluating Hypergraph of Text","ref_index":34,"is_internal_anchor":true},{"citing_arxiv_id":"2604.23783","citing_title":"S2G-RAG: Structured Sufficiency and Gap Judging for Iterative Retrieval-Augmented QA","ref_index":1,"is_internal_anchor":true},{"citing_arxiv_id":"2605.04523","citing_title":"RaguTeam at SemEval-2026 Task 8: Meno and Friends in a Judge-Orchestrated LLM Ensemble for Faithful Multi-Turn Response Generation","ref_index":3,"is_internal_anchor":true},{"citing_arxiv_id":"2605.01495","citing_title":"FT-RAG: A Fine-grained Retrieval-Augmented Generation Framework for Complex Table Reasoning","ref_index":18,"is_internal_anchor":true},{"citing_arxiv_id":"2604.19899","citing_title":"A Reproducibility Study of Metacognitive Retrieval-Augmented Generation","ref_index":44,"is_internal_anchor":true},{"citing_arxiv_id":"2604.10426","citing_title":"CodaRAG: Connecting the Dots with Associativity Inspired by Complementary Learning","ref_index":40,"is_internal_anchor":true},{"citing_arxiv_id":"2404.16130","citing_title":"From Local to Global: A Graph RAG Approach to Query-Focused Summarization","ref_index":60,"is_internal_anchor":true},{"citing_arxiv_id":"2604.17943","citing_title":"A Benchmark Construction and Evaluation Framework for Specialist Domains: Case Study on Defense-related Documents","ref_index":27,"is_internal_anchor":true},{"citing_arxiv_id":"2604.15771","citing_title":"Skill-RAG: Failure-State-Aware Retrieval Augmentation via Hidden-State Probing and Skill Routing","ref_index":19,"is_internal_anchor":true},{"citing_arxiv_id":"2604.15676","citing_title":"EvoRAG: Making Knowledge Graph-based RAG Automatically Evolve through Feedback-driven Backpropagation","ref_index":79,"is_internal_anchor":true},{"citing_arxiv_id":"2604.18234","citing_title":"Evaluating Multi-Hop Reasoning in RAG Systems: A Comparison of LLM-Based Retriever Evaluation Strategies","ref_index":25,"is_internal_anchor":true},{"citing_arxiv_id":"2604.20572","citing_title":"Ask Only When Needed: Proactive Retrieval from Memory and Skills for Experience-Driven Lifelong Agents","ref_index":15,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":1,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/C75OY7FNMDYVQFC4XNHJNN525T","json":"https://pith.science/pith/C75OY7FNMDYVQFC4XNHJNN525T.json","graph_json":"https://pith.science/api/pith-number/C75OY7FNMDYVQFC4XNHJNN525T/graph.json","events_json":"https://pith.science/api/pith-number/C75OY7FNMDYVQFC4XNHJNN525T/events.json","paper":"https://pith.science/paper/C75OY7FN"},"agent_actions":{"view_html":"https://pith.science/pith/C75OY7FNMDYVQFC4XNHJNN525T","download_json":"https://pith.science/pith/C75OY7FNMDYVQFC4XNHJNN525T.json","view_paper":"https://pith.science/paper/C75OY7FN","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2401.15391&json=true","fetch_graph":"https://pith.science/api/pith-number/C75OY7FNMDYVQFC4XNHJNN525T/graph.json","fetch_events":"https://pith.science/api/pith-number/C75OY7FNMDYVQFC4XNHJNN525T/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/C75OY7FNMDYVQFC4XNHJNN525T/action/timestamp_anchor","attest_storage":"https://pith.science/pith/C75OY7FNMDYVQFC4XNHJNN525T/action/storage_attestation","attest_author":"https://pith.science/pith/C75OY7FNMDYVQFC4XNHJNN525T/action/author_attestation","sign_citation":"https://pith.science/pith/C75OY7FNMDYVQFC4XNHJNN525T/action/citation_signature","submit_replication":"https://pith.science/pith/C75OY7FNMDYVQFC4XNHJNN525T/action/replication_record"}},"created_at":"2026-05-17T23:38:52.399434+00:00","updated_at":"2026-05-17T23:38:52.399434+00:00"}