{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:NS3TKUHLE75KCFYPPQZFWZH7MD","short_pith_number":"pith:NS3TKUHL","canonical_record":{"source":{"id":"2605.14503","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.SE","submitted_at":"2026-05-14T07:47:44Z","cross_cats_sorted":[],"title_canon_sha256":"2d9a2a7d20840e20242d6d4a50784bdef00b1b46df3cf6b8b31aee1e2c1d2057","abstract_canon_sha256":"eefc97159af8dfc774e669a4c648396f531565b4c582975463e78d529d0c725a"},"schema_version":"1.0"},"canonical_sha256":"6cb73550eb27faa1170f7c325b64ff60ea01fa8ebf2e13632b93574984016f39","source":{"kind":"arxiv","id":"2605.14503","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.14503","created_at":"2026-05-17T23:39:06Z"},{"alias_kind":"arxiv_version","alias_value":"2605.14503v1","created_at":"2026-05-17T23:39:06Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.14503","created_at":"2026-05-17T23:39:06Z"},{"alias_kind":"pith_short_12","alias_value":"NS3TKUHLE75K","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"NS3TKUHLE75KCFYP","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"NS3TKUHL","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:NS3TKUHLE75KCFYPPQZFWZH7MD","target":"record","payload":{"canonical_record":{"source":{"id":"2605.14503","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.SE","submitted_at":"2026-05-14T07:47:44Z","cross_cats_sorted":[],"title_canon_sha256":"2d9a2a7d20840e20242d6d4a50784bdef00b1b46df3cf6b8b31aee1e2c1d2057","abstract_canon_sha256":"eefc97159af8dfc774e669a4c648396f531565b4c582975463e78d529d0c725a"},"schema_version":"1.0"},"canonical_sha256":"6cb73550eb27faa1170f7c325b64ff60ea01fa8ebf2e13632b93574984016f39","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:39:06.291603Z","signature_b64":"3/LWRAVdVNo6ZtU40+iLTzH71xlLttJoWSa4HSbzXwcNCYuNQLQWOe4MtZ2qOsB79xNJ9AdAo12tPbKOANYtAg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"6cb73550eb27faa1170f7c325b64ff60ea01fa8ebf2e13632b93574984016f39","last_reissued_at":"2026-05-17T23:39:06.290692Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:39:06.290692Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2605.14503","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:39:06Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"9fyhuXtvYnj6dzkVm4v0wgv4IBPvjrVxRtyQdXdTVvZgKxlGiUY6lYmsP/dVZk+FcZ+BXLuHwq5bpJOEQG6VBw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-07T11:48:00.076241Z"},"content_sha256":"3e78243d9bc5a4ebbb67035fa07869e388316e346916c93443644292ffb0f301","schema_version":"1.0","event_id":"sha256:3e78243d9bc5a4ebbb67035fa07869e388316e346916c93443644292ffb0f301"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:NS3TKUHLE75KCFYPPQZFWZH7MD","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Not All RAGs Are Created Equal: A Component-Wise Empirical Study for Software Engineering Tasks","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"Retriever components, especially the algorithm, often influence RAG performance for software engineering tasks more than the generator model.","cross_cats":[],"primary_cat":"cs.SE","authors_text":"Haoyu Wang, Hongjin Leng, Qiang Ke, Shengming Zhao, Yanjie Zhao","submitted_at":"2026-05-14T07:47:44Z","abstract_excerpt":"While Retrieval-Augmented Generation (RAG) is increasingly adopted to ground Large Language Models (LLMs) in software artifacts, the optimal configuration of its components remains an open question for software engineering (SE) tasks. The lack of systematic guidance forces practitioners into costly, ad-hoc experimentation. This paper presents a comprehensive, component-wise empirical study that dissects the RAG pipeline, evaluating over 21 distinct models and methods. Our study systematically isolates and evaluates 4 query processing techniques, 7 retrieval models spanning sparse, dense, and h"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"the retriever-side components, particularly the choice of the retrieval algorithm, often exert a more significant influence on final system performance than the selection of the generator model","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the three chosen SE tasks and the specific models and datasets used are representative enough for the observed component rankings to generalize to other software engineering contexts and real-world codebases.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Retriever-side choices, particularly the retrieval algorithm, exert more influence on RAG performance than generator selection across code generation, summarization, and repair tasks.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Retriever components, especially the algorithm, often influence RAG performance for software engineering tasks more than the generator model.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"a35e355273df0fb75d30b785e3f4aec29e307978a7a23890d2460d7fa5650283"},"source":{"id":"2605.14503","kind":"arxiv","version":1},"verdict":{"id":"1b2eab88-e4b5-4171-8c36-ed3cd8ee69f9","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T01:48:57.380310Z","strongest_claim":"the retriever-side components, particularly the choice of the retrieval algorithm, often exert a more significant influence on final system performance than the selection of the generator model","one_line_summary":"Retriever-side choices, particularly the retrieval algorithm, exert more influence on RAG performance than generator selection across code generation, summarization, and repair tasks.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the three chosen SE tasks and the specific models and datasets used are representative enough for the observed component rankings to generalize to other software engineering contexts and real-world codebases.","pith_extraction_headline":"Retriever components, especially the algorithm, often influence RAG performance for software engineering tasks more than the generator model."},"references":{"count":56,"sample":[{"doi":"","year":2024,"title":"Phi-3 Technical Report: A Highly Capable Language Model Locally on Your Phone","work_id":"feef9556-a016-493c-abd2-0c97a23a7ebf","ref_index":1,"cited_arxiv_id":"2404.14219","is_internal_anchor":true},{"doi":"10.1145/3290353","year":2019,"title":"Code2vec: Learning distributed representations of code","work_id":"e7c7cc54-ab7b-46f7-87f8-2ba8df2cb944","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"10.1016/j.procs.2024.09.178","year":2024,"title":"Muhammad Arslan, Hussam Ghanem, Saba Munawar, and Christophe Cruz. 2024. A Survey on RAG with LLMs. Procedia Computer Science246 (2024), 3781–3790. doi:10.1016/j.procs.2024.09.178","work_id":"9d021890-9f08-4851-b877-f7b5f90dc7cf","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2025,"title":"Cweval: Outcome-driven evaluation on functionality and security of LLM code generation","work_id":"4d56f933-3be6-4f16-9456-edbce6af7c36","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"10.1109/icsme64153.2025.00046","year":2025,"title":"Nguyen, Hridesh Rajan, Nikolaos Tsantalis, and Danny Dig","work_id":"f38aeca9-0069-4116-b7e2-f8a094f0c98e","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":56,"snapshot_sha256":"03b105fa2b4d67ac4c5d05a033b2d334d50d0c34b85e44e036435a4d3dc88b2a","internal_anchors":20},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"1b2eab88-e4b5-4171-8c36-ed3cd8ee69f9"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:39:06Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"mUyHduvnJNy/6orJ1P6BIIgrBgfSIDYa/Z/921mvKEw4QxbaOKCcnTeUVNNvVgoqCElZ1QkJT83/JqDvsS/7Ag==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-07T11:48:00.077293Z"},"content_sha256":"4e38cefb61cf16dc0506b67996f99cb9ed8d204e33b748c7555ca8a77be4328d","schema_version":"1.0","event_id":"sha256:4e38cefb61cf16dc0506b67996f99cb9ed8d204e33b748c7555ca8a77be4328d"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/NS3TKUHLE75KCFYPPQZFWZH7MD/bundle.json","state_url":"https://pith.science/pith/NS3TKUHLE75KCFYPPQZFWZH7MD/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/NS3TKUHLE75KCFYPPQZFWZH7MD/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-07T11:48:00Z","links":{"resolver":"https://pith.science/pith/NS3TKUHLE75KCFYPPQZFWZH7MD","bundle":"https://pith.science/pith/NS3TKUHLE75KCFYPPQZFWZH7MD/bundle.json","state":"https://pith.science/pith/NS3TKUHLE75KCFYPPQZFWZH7MD/state.json","well_known_bundle":"https://pith.science/.well-known/pith/NS3TKUHLE75KCFYPPQZFWZH7MD/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:NS3TKUHLE75KCFYPPQZFWZH7MD","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"eefc97159af8dfc774e669a4c648396f531565b4c582975463e78d529d0c725a","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.SE","submitted_at":"2026-05-14T07:47:44Z","title_canon_sha256":"2d9a2a7d20840e20242d6d4a50784bdef00b1b46df3cf6b8b31aee1e2c1d2057"},"schema_version":"1.0","source":{"id":"2605.14503","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.14503","created_at":"2026-05-17T23:39:06Z"},{"alias_kind":"arxiv_version","alias_value":"2605.14503v1","created_at":"2026-05-17T23:39:06Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.14503","created_at":"2026-05-17T23:39:06Z"},{"alias_kind":"pith_short_12","alias_value":"NS3TKUHLE75K","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"NS3TKUHLE75KCFYP","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"NS3TKUHL","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:4e38cefb61cf16dc0506b67996f99cb9ed8d204e33b748c7555ca8a77be4328d","target":"graph","created_at":"2026-05-17T23:39:06Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"the retriever-side components, particularly the choice of the retrieval algorithm, often exert a more significant influence on final system performance than the selection of the generator model"},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the three chosen SE tasks and the specific models and datasets used are representative enough for the observed component rankings to generalize to other software engineering contexts and real-world codebases."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Retriever-side choices, particularly the retrieval algorithm, exert more influence on RAG performance than generator selection across code generation, summarization, and repair tasks."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Retriever components, especially the algorithm, often influence RAG performance for software engineering tasks more than the generator model."}],"snapshot_sha256":"a35e355273df0fb75d30b785e3f4aec29e307978a7a23890d2460d7fa5650283"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"While Retrieval-Augmented Generation (RAG) is increasingly adopted to ground Large Language Models (LLMs) in software artifacts, the optimal configuration of its components remains an open question for software engineering (SE) tasks. The lack of systematic guidance forces practitioners into costly, ad-hoc experimentation. This paper presents a comprehensive, component-wise empirical study that dissects the RAG pipeline, evaluating over 21 distinct models and methods. Our study systematically isolates and evaluates 4 query processing techniques, 7 retrieval models spanning sparse, dense, and h","authors_text":"Haoyu Wang, Hongjin Leng, Qiang Ke, Shengming Zhao, Yanjie Zhao","cross_cats":[],"headline":"Retriever components, especially the algorithm, often influence RAG performance for software engineering tasks more than the generator model.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.SE","submitted_at":"2026-05-14T07:47:44Z","title":"Not All RAGs Are Created Equal: A Component-Wise Empirical Study for Software Engineering Tasks"},"references":{"count":56,"internal_anchors":20,"resolved_work":56,"sample":[{"cited_arxiv_id":"2404.14219","doi":"","is_internal_anchor":true,"ref_index":1,"title":"Phi-3 Technical Report: A Highly Capable Language Model Locally on Your Phone","work_id":"feef9556-a016-493c-abd2-0c97a23a7ebf","year":2024},{"cited_arxiv_id":"","doi":"10.1145/3290353","is_internal_anchor":false,"ref_index":2,"title":"Code2vec: Learning distributed representations of code","work_id":"e7c7cc54-ab7b-46f7-87f8-2ba8df2cb944","year":2019},{"cited_arxiv_id":"","doi":"10.1016/j.procs.2024.09.178","is_internal_anchor":false,"ref_index":3,"title":"Muhammad Arslan, Hussam Ghanem, Saba Munawar, and Christophe Cruz. 2024. A Survey on RAG with LLMs. Procedia Computer Science246 (2024), 3781–3790. doi:10.1016/j.procs.2024.09.178","work_id":"9d021890-9f08-4851-b877-f7b5f90dc7cf","year":2024},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Cweval: Outcome-driven evaluation on functionality and security of LLM code generation","work_id":"4d56f933-3be6-4f16-9456-edbce6af7c36","year":2025},{"cited_arxiv_id":"","doi":"10.1109/icsme64153.2025.00046","is_internal_anchor":false,"ref_index":5,"title":"Nguyen, Hridesh Rajan, Nikolaos Tsantalis, and Danny Dig","work_id":"f38aeca9-0069-4116-b7e2-f8a094f0c98e","year":2025}],"snapshot_sha256":"03b105fa2b4d67ac4c5d05a033b2d334d50d0c34b85e44e036435a4d3dc88b2a"},"source":{"id":"2605.14503","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-15T01:48:57.380310Z","id":"1b2eab88-e4b5-4171-8c36-ed3cd8ee69f9","model_set":{"reader":"grok-4.3"},"one_line_summary":"Retriever-side choices, particularly the retrieval algorithm, exert more influence on RAG performance than generator selection across code generation, summarization, and repair tasks.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Retriever components, especially the algorithm, often influence RAG performance for software engineering tasks more than the generator model.","strongest_claim":"the retriever-side components, particularly the choice of the retrieval algorithm, often exert a more significant influence on final system performance than the selection of the generator model","weakest_assumption":"That the three chosen SE tasks and the specific models and datasets used are representative enough for the observed component rankings to generalize to other software engineering contexts and real-world codebases."}},"verdict_id":"1b2eab88-e4b5-4171-8c36-ed3cd8ee69f9"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:3e78243d9bc5a4ebbb67035fa07869e388316e346916c93443644292ffb0f301","target":"record","created_at":"2026-05-17T23:39:06Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"eefc97159af8dfc774e669a4c648396f531565b4c582975463e78d529d0c725a","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.SE","submitted_at":"2026-05-14T07:47:44Z","title_canon_sha256":"2d9a2a7d20840e20242d6d4a50784bdef00b1b46df3cf6b8b31aee1e2c1d2057"},"schema_version":"1.0","source":{"id":"2605.14503","kind":"arxiv","version":1}},"canonical_sha256":"6cb73550eb27faa1170f7c325b64ff60ea01fa8ebf2e13632b93574984016f39","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"6cb73550eb27faa1170f7c325b64ff60ea01fa8ebf2e13632b93574984016f39","first_computed_at":"2026-05-17T23:39:06.290692Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:39:06.290692Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"3/LWRAVdVNo6ZtU40+iLTzH71xlLttJoWSa4HSbzXwcNCYuNQLQWOe4MtZ2qOsB79xNJ9AdAo12tPbKOANYtAg==","signature_status":"signed_v1","signed_at":"2026-05-17T23:39:06.291603Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.14503","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:3e78243d9bc5a4ebbb67035fa07869e388316e346916c93443644292ffb0f301","sha256:4e38cefb61cf16dc0506b67996f99cb9ed8d204e33b748c7555ca8a77be4328d"],"state_sha256":"b339195dfe46af89610ab3356d48f61b7b9e9178dd91730d41843455e39bac01"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"hHM0JcdGApkmeh0S0ZyU9Ujim8DVQAyINaGUOcE5HX4+gN4OTBW+KYb1YTEd1+DY+9kmxIKln2YSQ4gOh4gRDg==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-07T11:48:00.081738Z","bundle_sha256":"63c9c12a272c67d1fa004e70d7b077541a74cbb0da4a452cc56d9bc87af3eb19"}}