{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:RTJFRTJVNTUIQRGSYKVKHLYTSH","short_pith_number":"pith:RTJFRTJV","schema_version":"1.0","canonical_sha256":"8cd258cd356ce88844d2c2aaa3af1391ebdf2c556c7903a4099e6ad791cca20a","source":{"kind":"arxiv","id":"2605.15537","version":1},"attestation_state":"computed","paper":{"title":"RTL-BenchMT: Dynamic Maintenance of RTL Generation Benchmark Through Agent-Assisted Analysis and Revision","license":"http://creativecommons.org/licenses/by/4.0/","headline":"An agentic framework automatically identifies flawed RTL benchmark cases and detects overfitting to produce a refined suite.","cross_cats":[],"primary_cat":"cs.AI","authors_text":"Hangan Zhou, Jing Wang, Shang Liu, Zhiyao Xie","submitted_at":"2026-05-15T02:17:46Z","abstract_excerpt":"This paper introduces RTL-BenchMT, an agentic framework for dynamically maintaining RTL generation benchmarks. Large Language Models (LLMs) assisted automated RTL generation is one of the most important directions in EDA research. However, current RTL benchmarks face two critical challenges: (1) flawed cases in the benchmarks and (2) overfitting to the benchmarks. Both challenges are difficult to resolve purely by manual engineering effort. To address these issues and systematically reduce human maintenance costs, we propose an automated agentic framework, RTL-BenchMT. RTL-BenchMT focuses on t"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":true,"formal_links_present":true},"canonical_record":{"source":{"id":"2605.15537","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2026-05-15T02:17:46Z","cross_cats_sorted":[],"title_canon_sha256":"1f71be7c1cf100268d7be3729717ec7370b7cae987c2cd84a54e2bb44f5ee70c","abstract_canon_sha256":"77448166df1ba81182aa49d40ee0d1fb855f31b15510469b6de6dfd93c09400b"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T00:01:04.132292Z","signature_b64":"px6Ccrt+hEu5S9y62TkVA7/I8Ylmhhtr4yxQxaf1z2DFRDeLgzu6YejQ5pfFQfnuArOrQaLxUXtD1WUlaLjfCQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"8cd258cd356ce88844d2c2aaa3af1391ebdf2c556c7903a4099e6ad791cca20a","last_reissued_at":"2026-05-20T00:01:04.131587Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T00:01:04.131587Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"RTL-BenchMT: Dynamic Maintenance of RTL Generation Benchmark Through Agent-Assisted Analysis and Revision","license":"http://creativecommons.org/licenses/by/4.0/","headline":"An agentic framework automatically identifies flawed RTL benchmark cases and detects overfitting to produce a refined suite.","cross_cats":[],"primary_cat":"cs.AI","authors_text":"Hangan Zhou, Jing Wang, Shang Liu, Zhiyao Xie","submitted_at":"2026-05-15T02:17:46Z","abstract_excerpt":"This paper introduces RTL-BenchMT, an agentic framework for dynamically maintaining RTL generation benchmarks. Large Language Models (LLMs) assisted automated RTL generation is one of the most important directions in EDA research. However, current RTL benchmarks face two critical challenges: (1) flawed cases in the benchmarks and (2) overfitting to the benchmarks. Both challenges are difficult to resolve purely by manual engineering effort. To address these issues and systematically reduce human maintenance costs, we propose an automated agentic framework, RTL-BenchMT. RTL-BenchMT focuses on t"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"With the assistance of RTL-BenchMT, we conduct a thorough, in-depth analysis of flawed and overfitting cases and produce a refined benchmark suite that will be open-sourced to the community.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That AI agents can reliably and accurately detect flawed benchmark cases and overfitting instances in RTL generation tasks without introducing new errors or requiring substantial human validation.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"RTL-BenchMT is an agent-assisted framework for dynamically maintaining RTL generation benchmarks by fixing flaws and reducing overfitting in LLM-based EDA applications.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"An agentic framework automatically identifies flawed RTL benchmark cases and detects overfitting to produce a refined suite.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"6a883a501a5ae806944fc016d31d98336e8344bcd35c8bc70007048cfe882688"},"source":{"id":"2605.15537","kind":"arxiv","version":1},"verdict":{"id":"bfc44f51-0872-40bb-ad19-81dd00dee9af","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-19T14:37:03.553750Z","strongest_claim":"With the assistance of RTL-BenchMT, we conduct a thorough, in-depth analysis of flawed and overfitting cases and produce a refined benchmark suite that will be open-sourced to the community.","one_line_summary":"RTL-BenchMT is an agent-assisted framework for dynamically maintaining RTL generation benchmarks by fixing flaws and reducing overfitting in LLM-based EDA applications.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That AI agents can reliably and accurately detect flawed benchmark cases and overfitting instances in RTL generation tasks without introducing new errors or requiring substantial human validation.","pith_extraction_headline":"An agentic framework automatically identifies flawed RTL benchmark cases and detects overfitting to produce a refined suite."},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.15537/integrity.json","findings":[],"available":true,"detectors_run":[{"name":"doi_title_agreement","ran_at":"2026-05-19T15:01:17.497187Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"doi_compliance","ran_at":"2026-05-19T14:49:56.481343Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"cited_work_retraction","ran_at":"2026-05-19T14:22:01.493804Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"claim_evidence","ran_at":"2026-05-19T14:21:54.030847Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"shingle_duplication","ran_at":"2026-05-19T13:49:41.830243Z","status":"skipped","version":"0.1.0","findings_count":0},{"name":"citation_quote_validity","ran_at":"2026-05-19T13:49:41.367480Z","status":"skipped","version":"0.1.0","findings_count":0},{"name":"ai_meta_artifact","ran_at":"2026-05-19T13:33:22.615589Z","status":"skipped","version":"1.0.0","findings_count":0}],"snapshot_sha256":"7a6e9dc8eaa2c6a7d3202a96e144c377f549eb9cc3c97ec31aaae6d3eacabd9e"},"references":{"count":20,"sample":[{"doi":"","year":2023,"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","ref_index":1,"cited_arxiv_id":"2303.08774","is_internal_anchor":true},{"doi":"","year":2025,"title":"Mohammad Akyash, Kimia Azar, and Hadi Kamali. 2025. DecoRTL: A Run- time Decoding Framework for RTL Code Generation with LLMs.arXiv preprint arXiv:2507.02226(2025)","work_id":"6a642ad9-8d02-49b3-abf9-29152b01715d","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2025,"title":"doi:10.48550/arXiv.2502.07445 arXiv:2502.07445 [cs]","work_id":"d59b41eb-1454-4f6c-9562-48772f4fcc27","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"Origen: Enhancing rtl code generation with code-to-code augmentation and self-reflection","work_id":"4db2bc96-cacf-4fe7-9083-2fbd6f57ebe1","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"Verilogcoder: Autonomous verilog coding agents with graph-based planning and abstract syntax tree (ast)-based waveform tracing tool","work_id":"0515df68-0b7f-4444-9310-0fb92971f10c","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":20,"snapshot_sha256":"15332168517bb8c1fa4215c7cb055b611f4a14549275256caa824e40ff6b7589","internal_anchors":3},"formal_canon":{"evidence_count":1,"snapshot_sha256":"59686466f8b3a75c12ca5812bf206f8baacdf2b3456fdc7418d7ed03f37d142f"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2605.15537","created_at":"2026-05-20T00:01:04.131717+00:00"},{"alias_kind":"arxiv_version","alias_value":"2605.15537v1","created_at":"2026-05-20T00:01:04.131717+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.15537","created_at":"2026-05-20T00:01:04.131717+00:00"},{"alias_kind":"pith_short_12","alias_value":"RTJFRTJVNTUI","created_at":"2026-05-20T00:01:04.131717+00:00"},{"alias_kind":"pith_short_16","alias_value":"RTJFRTJVNTUIQRGS","created_at":"2026-05-20T00:01:04.131717+00:00"},{"alias_kind":"pith_short_8","alias_value":"RTJFRTJV","created_at":"2026-05-20T00:01:04.131717+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":1,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/RTJFRTJVNTUIQRGSYKVKHLYTSH","json":"https://pith.science/pith/RTJFRTJVNTUIQRGSYKVKHLYTSH.json","graph_json":"https://pith.science/api/pith-number/RTJFRTJVNTUIQRGSYKVKHLYTSH/graph.json","events_json":"https://pith.science/api/pith-number/RTJFRTJVNTUIQRGSYKVKHLYTSH/events.json","paper":"https://pith.science/paper/RTJFRTJV"},"agent_actions":{"view_html":"https://pith.science/pith/RTJFRTJVNTUIQRGSYKVKHLYTSH","download_json":"https://pith.science/pith/RTJFRTJVNTUIQRGSYKVKHLYTSH.json","view_paper":"https://pith.science/paper/RTJFRTJV","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2605.15537&json=true","fetch_graph":"https://pith.science/api/pith-number/RTJFRTJVNTUIQRGSYKVKHLYTSH/graph.json","fetch_events":"https://pith.science/api/pith-number/RTJFRTJVNTUIQRGSYKVKHLYTSH/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/RTJFRTJVNTUIQRGSYKVKHLYTSH/action/timestamp_anchor","attest_storage":"https://pith.science/pith/RTJFRTJVNTUIQRGSYKVKHLYTSH/action/storage_attestation","attest_author":"https://pith.science/pith/RTJFRTJVNTUIQRGSYKVKHLYTSH/action/author_attestation","sign_citation":"https://pith.science/pith/RTJFRTJVNTUIQRGSYKVKHLYTSH/action/citation_signature","submit_replication":"https://pith.science/pith/RTJFRTJVNTUIQRGSYKVKHLYTSH/action/replication_record"}},"created_at":"2026-05-20T00:01:04.131717+00:00","updated_at":"2026-05-20T00:01:04.131717+00:00"}