{"total":14,"items":[{"citing_arxiv_id":"2606.25588","ref_index":45,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"IntentTester: Intent-Driven Multi-agent Framework for Cross-Library Test Migration","primary_cat":"cs.SE","submitted_at":"2026-06-24T08:59:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"IntentTester migrates tests across libraries using TDL abstraction and multi-agent LLM synthesis, achieving 85% correctness and 74% effectiveness versus 51% and 43% for baselines on nine projects in JSON, HTML, and Time domains.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.22711","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Beyond Simpson's Paradox: A Cascade of Confounders in AI Agent Pull-Request Co-Authorship","primary_cat":"cs.SE","submitted_at":"2026-06-21T23:16:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Stratified analysis of AIDev PRs shows co-authorship effects on AI agent merge rates are artefacts of agent composition, repository selection, and PR commit structure rather than causal benefits.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.22475","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"All Green, Still Broken: Real-Flow Verification Lessons from an LLM-Integrated, Multi-Market Web Application","primary_cat":"cs.SE","submitted_at":"2026-06-21T12:34:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Analysis of 252 bug fixes in an LLM-powered multi-market web app found 44% escaped through four seams invisible to component unit tests, motivating a four-seam 
verification framework.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.05493","ref_index":16,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"REStack: A Large-Scale Dataset of Reverse Engineering Discussions from Stack Exchange","primary_cat":"cs.SE","submitted_at":"2026-06-03T22:34:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"REStack is a new public dataset of 12k+ RE discussions from Stack Exchange sites, enriched with 23 LDA-derived topics grouped into six categories and community-derived difficulty metadata.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30777","ref_index":79,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"What Breaks When LLMs Code? Characterizing Operational Safety Failures of Agentic Code Assistants","primary_cat":"cs.SE","submitted_at":"2026-05-29T03:09:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"An empirical study of 547 confirmed safety incidents from GitHub and literature derives a 33-type taxonomy showing constraint violations, destructive actions, and deception dominate in everyday coding-agent use.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07957","ref_index":77,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Similar Pattern Annotation via Retrieval Knowledge for LLM-Based Test Code Fault Localization","primary_cat":"cs.SE","submitted_at":"2026-05-08T16:20:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SPARK improves LLM-based test code fault localization by retrieving similar past faults and 
selectively annotating suspicious lines in new failing tests.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"RAP-Gen: Retrieval-Augmented Patch Generation with CodeT5 for Automatic Program Repair. In Proceedings of the 31st ACM Joint European Software Engineering Conference and Symposium on the Foundations of Software Engineering (San Francisco, CA, USA) (ESEC/FSE 2023). Association for Computing Machinery, New York, NY, USA, 146–158. doi:10.1145/3611643.3616256 [77] Ying Wang, Bihuan Chen, Kaifeng Huang, Bowen Shi, Congying Xu, Xin Peng, Yijian Wu, and Yang Liu. 2020. An Empirical Study of Usages, Updates and Risks of Third-Party Libraries in Java Projects. In IEEE International Conference on Software Maintenance and Evolution, ICSME 2020, Adelaide, Australia, September 28 - October 2, 2020. IEEE, 35-45. doi:10."},{"citing_arxiv_id":"2605.00922","ref_index":255,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"To Vibe Research or Not to Vibe Research? Generative AI in Qualitative Research","primary_cat":"cs.SE","submitted_at":"2026-04-30T17:19:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":2.0,"formal_verification":"none","one_line_summary":"Generative AI suitability in qualitative research depends primarily on the approach (small-q positivist/post-positivist or Big Q non-positivist) along with skills, ethics, and personal preferences.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.24072","ref_index":21,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"How Do Developers Use Migration Guides? 
A Case Study of Log4j","primary_cat":"cs.SE","submitted_at":"2026-04-27T05:59:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Developers most frequently reference the full Log4j migration guide in pull request descriptions (82.81% of cases) and continue consulting it during post-update maintenance tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17529","ref_index":53,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Single-Language Evidence Is Insufficient for Automated Logging: A Multilingual Benchmark and Empirical Study with LLMs","primary_cat":"cs.SE","submitted_at":"2026-04-19T16:43:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MultiLogBench shows that LLM performance on automated logging varies substantially across programming languages, demonstrating that single-language evidence is insufficient for general claims about model behavior or tool design.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"The motivation is that both modern code LLMs and general-purpose LLMs are trained on large-scale public code corpora, creating a meaningful risk that benchmark instances mined directly from GitHub may overlap with the models' pretraining data [63]. In this setting, strong performance on the original benchmark may reflect a mixture of genuine inference and memorization [53, 62]. Following prior work on contamination-aware evaluation in automated logging [33], we construct an additional paired evaluation set in which the surrounding code is rewritten into a surface-distinct but target-equivalent form whenever a valid transformation is available, while the target logging decision remains unchanged. 
The goal is not to make the code artificially obscure, but to test whether"},{"citing_arxiv_id":"2512.00380","ref_index":35,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Knowledge-Graph-Driven Data Synthesis for Low-Resource Software Development: A HarmonyOS Case Study","primary_cat":"cs.SE","submitted_at":"2025-11-29T08:13:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"APIKG4Syn synthesizes API-oriented training data via knowledge graphs and Monte Carlo search to fine-tune a 7B model that reaches 25% pass@1 on HarmonyOS code generation, beating untuned GPT-4o at 17.59%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.10823","ref_index":9,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"From Paradigm Shift to Audit Rift: Empirical Analysis and Validation of Security Audit Methodologies for Asynchronous Smart Contract Systems","primary_cat":"cs.CR","submitted_at":"2025-09-13T14:41:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Empirical review of 233 real-world vulnerabilities from 34 TON audits produces a specialized checklist for asynchronous message handling, supported by case studies and an 11-person practitioner survey.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2506.19045","ref_index":30,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Efficient Black-Box Fault Localization for System-Level Test Code Using Large Language 
Models","primary_cat":"cs.SE","submitted_at":"2025-06-23T19:04:51+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2408.15815","ref_index":48,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MR-Adopt: Automatic Deduction of Input Transformation Function for Metamorphic Testing","primary_cat":"cs.SE","submitted_at":"2024-08-28T14:24:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MR-Adopt deduces input transformations from hard-coded MR test cases using LLMs, data-flow refinement, and output-relation selection to enable reuse with new source inputs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2304.07548","ref_index":56,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MR-Scout: Automated Synthesis of Metamorphic Relations from Existing Test Cases","primary_cat":"cs.SE","submitted_at":"2023-04-15T12:53:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MR-Scout extracts over 11,000 metamorphic-relation-encoded test cases from 701 OSS projects, codifies 97% of them as high-quality generators, and shows they raise line coverage by 13.52% and mutation score by 9.42% on programs that already have developer tests.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}