{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:23J2LGEKOOQ2WZOMQUWVIVVKOI","short_pith_number":"pith:23J2LGEK","canonical_record":{"source":{"id":"2605.15224","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.AI","submitted_at":"2026-05-13T08:50:05Z","cross_cats_sorted":["cs.MA"],"title_canon_sha256":"3a24ce8aed65cb35d8fad1a68935797323fb6d2b6763ded61970ac03e6bdc42e","abstract_canon_sha256":"9fdc1d3d1bd380c0214ee6bf2cb1932af79419a7933be850497b565dc9bfeb89"},"schema_version":"1.0"},"canonical_sha256":"d6d3a5988a73a1ab65cc852d5456aa722a1bbb89089139b366d27d177770154e","source":{"kind":"arxiv","id":"2605.15224","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.15224","created_at":"2026-05-20T00:00:47Z"},{"alias_kind":"arxiv_version","alias_value":"2605.15224v1","created_at":"2026-05-20T00:00:47Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.15224","created_at":"2026-05-20T00:00:47Z"},{"alias_kind":"pith_short_12","alias_value":"23J2LGEKOOQ2","created_at":"2026-05-20T00:00:47Z"},{"alias_kind":"pith_short_16","alias_value":"23J2LGEKOOQ2WZOM","created_at":"2026-05-20T00:00:47Z"},{"alias_kind":"pith_short_8","alias_value":"23J2LGEK","created_at":"2026-05-20T00:00:47Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:23J2LGEKOOQ2WZOMQUWVIVVKOI","target":"record","payload":{"canonical_record":{"source":{"id":"2605.15224","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.AI","submitted_at":"2026-05-13T08:50:05Z","cross_cats_sorted":["cs.MA"],"title_canon_sha256":"3a24ce8aed65cb35d8fad1a68935797323fb6d2b6763ded61970ac03e6bdc42e","abstract_canon_sha256":"9fdc1d3d1bd380c0214ee6bf2cb1932af79419a7933be850497b565dc9bfeb89"},"schema_version":"1.0"},"canonical_sha256":"d6d3a5988a73a1ab65cc852d5456aa722a1bbb89089139b366d27d177770154e","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T00:00:47.194918Z","signature_b64":"nWWTBevHMv5e9dpbZjExd1+XpcVOe4goXL24WRu09ZtoMqd2xbAqtK1KqENY1IuQQqTO4EAVXyCm8liyLNoMCA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"d6d3a5988a73a1ab65cc852d5456aa722a1bbb89089139b366d27d177770154e","last_reissued_at":"2026-05-20T00:00:47.193945Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T00:00:47.193945Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2605.15224","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-20T00:00:47Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"Ew/sPKQxJzr4/DQeCUKHEVITTPA/URjoEZRbn1AmtxsSdSu5fNMQXFWJdgOuLXK0vY8FmxG8aB3AKPa7as8wBw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-24T02:28:44.700683Z"},"content_sha256":"400b52d659978990e18be74e966884e783b0480b82bb75c622c0a427ea8a5888","schema_version":"1.0","event_id":"sha256:400b52d659978990e18be74e966884e783b0480b82bb75c622c0a427ea8a5888"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:23J2LGEKOOQ2WZOMQUWVIVVKOI","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"ICRL: Learning to Internalize Self-Critique with Reinforcement Learning","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","headline":"ICRL jointly trains a solver and critic from one backbone so critique gains become part of unassisted performance.","cross_cats":["cs.MA"],"primary_cat":"cs.AI","authors_text":"Chengwei Qin, Heqing Zou, Hui Xiong, Jianbo Lin, Weishi Wang, Xiaomin Yu, Yifu Guo, Yi Xin, Zhongqi Yue, Zhuosong Jiang","submitted_at":"2026-05-13T08:50:05Z","abstract_excerpt":"Large language model-based agents make mistakes, yet critique can often guide the same model toward correct behavior. However, when critique is removed, the model may fail again on the same query, indicating that it has not internalized the critique's guidance into its underlying capability. Meanwhile, a frozen critic cannot improve its feedback quality over time, limiting the potential for iterative self-improvement. To address this, we propose learning to internalize self-critique with reinforcement learning(ICRL), a novel framework that jointly trains a solver and a critic from a shared bac"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"ICRL jointly trains a solver and a critic from a shared backbone to convert critique-induced success into unassisted solver ability, with results showing average gains of 6.4 points over GRPO on agentic tasks and 7.0 points on mathematical reasoning using Qwen3-4B and Qwen3-8B backbones.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The distribution-calibration re-weighting ratio successfully selects critique-guided improvements that remain compatible with the solver's own prompt distribution, without introducing bias or reducing performance on critique-free queries.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"ICRL uses joint RL training of solver and critic with distribution-calibration re-weighting and role-wise advantage estimation to internalize critique into unassisted LLM performance, yielding 6.4-point gains on agentic tasks and 7.0 on math reasoning with Qwen3 models.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"ICRL jointly trains a solver and critic from one backbone so critique gains become part of unassisted performance.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"af6ba23ad3909c298836d3bae9c60967f95ccffccd2f0916260f6b843232e1c2"},"source":{"id":"2605.15224","kind":"arxiv","version":1},"verdict":{"id":"13d6bd54-1a86-4c7b-97ae-b4aa295777c2","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-19T17:53:19.786250Z","strongest_claim":"ICRL jointly trains a solver and a critic from a shared backbone to convert critique-induced success into unassisted solver ability, with results showing average gains of 6.4 points over GRPO on agentic tasks and 7.0 points on mathematical reasoning using Qwen3-4B and Qwen3-8B backbones.","one_line_summary":"ICRL uses joint RL training of solver and critic with distribution-calibration re-weighting and role-wise advantage estimation to internalize critique into unassisted LLM performance, yielding 6.4-point gains on agentic tasks and 7.0 on math reasoning with Qwen3 models.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The distribution-calibration re-weighting ratio successfully selects critique-guided improvements that remain compatible with the solver's own prompt distribution, without introducing bias or reducing performance on critique-free queries.","pith_extraction_headline":"ICRL jointly trains a solver and critic from one backbone so critique gains become part of unassisted performance."},"integrity":{"clean":false,"summary":{"advisory":1,"critical":0,"by_detector":{"doi_compliance":{"total":1,"advisory":1,"critical":0,"informational":0}},"informational":0},"endpoint":"/pith/2605.15224/integrity.json","findings":[{"note":"DOI in the printed bibliography is fragmented by whitespace or line breaks. A longer candidate (10.1162/TACL\\_A\\_00475) was visible in the surrounding text but could not be confirmed against doi.org as printed.","detector":"doi_compliance","severity":"advisory","ref_index":28,"audited_at":"2026-05-19T18:01:24.690308Z","detected_doi":"10.1162/TACL\\_A\\_00475","finding_type":"recoverable_identifier","verdict_class":"incontrovertible","detected_arxiv_id":null}],"available":true,"detectors_run":[{"name":"claim_evidence","ran_at":"2026-05-19T20:01:57.074166Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"doi_compliance","ran_at":"2026-05-19T18:01:24.690308Z","status":"completed","version":"1.0.0","findings_count":1},{"name":"doi_title_agreement","ran_at":"2026-05-19T18:01:18.624674Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"ai_meta_artifact","ran_at":"2026-05-19T13:33:22.834146Z","status":"skipped","version":"1.0.0","findings_count":0}],"snapshot_sha256":"f3643a4cdb68bb505c3764342fd5bf2145110ad50029bfc1ce18115a0c39c457"},"references":{"count":45,"sample":[{"doi":"","year":null,"title":"Advances in neural information processing systems , volume=","work_id":"782a3ae1-f760-4d7d-98b8-85da324ebad6","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Advances in neural information processing systems , volume=","work_id":"07ff1f09-5238-40b1-9239-0e4afc5e9787","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"CRITIC: Large Language Models Can Self-Correct with Tool-Interactive Critiquing","work_id":"dcdfa0ce-7ed4-4614-acd9-8bdbf491fc26","ref_index":3,"cited_arxiv_id":"2305.11738","is_internal_anchor":true},{"doi":"","year":null,"title":"Self-RAG: Learning to Retrieve, Generate, and Critique through Self-Reflection , author=. ArXiv , year=","work_id":"3c43afea-eb94-4455-88d0-374efd5460c9","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"arXiv preprint arXiv:2303.16755 , year=","work_id":"18748c48-f59c-4357-8a13-e3854e429670","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":45,"snapshot_sha256":"5ef9a412375816af24c7a31bad52b7ee7876c64b938b29fcbe870abf0252d219","internal_anchors":12},"formal_canon":{"evidence_count":2,"snapshot_sha256":"145503fd6ca4ffa37b9cfcda91b08ccba6b56dea5fcfd73b7e981c39b1642c3e"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"13d6bd54-1a86-4c7b-97ae-b4aa295777c2"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-20T00:00:47Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"RWpGsJZj8GzmrtzTxzptRnzrHAt2d7jXypXPUNEfnmA1eKrCE1DiW0eNAFl17IuRREuGgbp+UcMHZXT1XoXFCA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-24T02:28:44.701921Z"},"content_sha256":"643c5e56b599ed0ee040c443a14566dc0c3e9336c367d7aa49f694f6f0cabcdd","schema_version":"1.0","event_id":"sha256:643c5e56b599ed0ee040c443a14566dc0c3e9336c367d7aa49f694f6f0cabcdd"},{"event_type":"integrity_finding","subject_pith_number":"pith:2026:23J2LGEKOOQ2WZOMQUWVIVVKOI","target":"integrity","payload":{"note":"DOI in the printed bibliography is fragmented by whitespace or line breaks. A longer candidate (10.1162/TACL\\_A\\_00475) was visible in the surrounding text but could not be confirmed against doi.org as printed.","snippet":"Harsh Trivedi and Niranjan Balasubramanian and Tushar Khot and Ashish Sabharwal , title =. Trans. Assoc. Comput. Linguistics , volume =. 2022 , url =. doi:10.1162/TACL\\_A\\_00475 , timestamp =","arxiv_id":"2605.15224","detector":"doi_compliance","evidence":{"ref_index":28,"verdict_class":"incontrovertible","resolved_title":null,"printed_excerpt":"Harsh Trivedi and Niranjan Balasubramanian and Tushar Khot and Ashish Sabharwal , title =. Trans. Assoc. Comput. Linguistics , volume =. 2022 , url =. doi:10.1162/TACL\\_A\\_00475 , timestamp =","reconstructed_doi":"10.1162/TACL\\_A\\_00475"},"severity":"advisory","ref_index":28,"audited_at":"2026-05-19T18:01:24.690308Z","event_type":"pith.integrity.v1","detected_doi":"10.1162/TACL\\_A\\_00475","detector_url":"https://pith.science/pith-integrity-protocol#doi_compliance","external_url":null,"finding_type":"recoverable_identifier","evidence_hash":"e575b6c1d032c7161817414770723158edb06302169b52c4bc17e737f0eef9d8","paper_version":1,"verdict_class":"incontrovertible","resolved_title":null,"detector_version":"1.0.0","detected_arxiv_id":null,"integrity_event_id":2387,"payload_sha256":"626fca926327f9f066c9ae6f0921cfb3c716a5bae2657d0f1b5a51d4e3f187e1","signature_b64":"+vCoqZ4RflfoDuA/hCkS2UMZsWM1CTLKi17qjhm3edEZ1kiFdOWul0EuOuYiHBxe3FY4/vjS2dIqMU0odlmFBA==","signing_key_id":"pith-v1-2026-05"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-19T18:02:16Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"S25Z+17p5lMgvGBPU6Aq34BvHWG/ZTKkP+vbUVxXysc/TDjEufcsJZqCYt4rQD33MNaEbr1YbPg3H9pfCivpBw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-24T02:28:44.703251Z"},"content_sha256":"ab2d50de2f4dba5863dbbf9d6ac957c85c8eb2493573615c6f8ddb09622d981f","schema_version":"1.0","event_id":"sha256:ab2d50de2f4dba5863dbbf9d6ac957c85c8eb2493573615c6f8ddb09622d981f"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/23J2LGEKOOQ2WZOMQUWVIVVKOI/bundle.json","state_url":"https://pith.science/pith/23J2LGEKOOQ2WZOMQUWVIVVKOI/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/23J2LGEKOOQ2WZOMQUWVIVVKOI/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-24T02:28:44Z","links":{"resolver":"https://pith.science/pith/23J2LGEKOOQ2WZOMQUWVIVVKOI","bundle":"https://pith.science/pith/23J2LGEKOOQ2WZOMQUWVIVVKOI/bundle.json","state":"https://pith.science/pith/23J2LGEKOOQ2WZOMQUWVIVVKOI/state.json","well_known_bundle":"https://pith.science/.well-known/pith/23J2LGEKOOQ2WZOMQUWVIVVKOI/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:23J2LGEKOOQ2WZOMQUWVIVVKOI","merge_version":"pith-open-graph-merge-v1","event_count":3,"valid_event_count":3,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"9fdc1d3d1bd380c0214ee6bf2cb1932af79419a7933be850497b565dc9bfeb89","cross_cats_sorted":["cs.MA"],"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.AI","submitted_at":"2026-05-13T08:50:05Z","title_canon_sha256":"3a24ce8aed65cb35d8fad1a68935797323fb6d2b6763ded61970ac03e6bdc42e"},"schema_version":"1.0","source":{"id":"2605.15224","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.15224","created_at":"2026-05-20T00:00:47Z"},{"alias_kind":"arxiv_version","alias_value":"2605.15224v1","created_at":"2026-05-20T00:00:47Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.15224","created_at":"2026-05-20T00:00:47Z"},{"alias_kind":"pith_short_12","alias_value":"23J2LGEKOOQ2","created_at":"2026-05-20T00:00:47Z"},{"alias_kind":"pith_short_16","alias_value":"23J2LGEKOOQ2WZOM","created_at":"2026-05-20T00:00:47Z"},{"alias_kind":"pith_short_8","alias_value":"23J2LGEK","created_at":"2026-05-20T00:00:47Z"}],"graph_snapshots":[{"event_id":"sha256:643c5e56b599ed0ee040c443a14566dc0c3e9336c367d7aa49f694f6f0cabcdd","target":"graph","created_at":"2026-05-20T00:00:47Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"ICRL jointly trains a solver and a critic from a shared backbone to convert critique-induced success into unassisted solver ability, with results showing average gains of 6.4 points over GRPO on agentic tasks and 7.0 points on mathematical reasoning using Qwen3-4B and Qwen3-8B backbones."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"The distribution-calibration re-weighting ratio successfully selects critique-guided improvements that remain compatible with the solver's own prompt distribution, without introducing bias or reducing performance on critique-free queries."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"ICRL uses joint RL training of solver and critic with distribution-calibration re-weighting and role-wise advantage estimation to internalize critique into unassisted LLM performance, yielding 6.4-point gains on agentic tasks and 7.0 on math reasoning with Qwen3 models."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"ICRL jointly trains a solver and critic from one backbone so critique gains become part of unassisted performance."}],"snapshot_sha256":"af6ba23ad3909c298836d3bae9c60967f95ccffccd2f0916260f6b843232e1c2"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"145503fd6ca4ffa37b9cfcda91b08ccba6b56dea5fcfd73b7e981c39b1642c3e"},"integrity":{"available":true,"clean":false,"detectors_run":[{"findings_count":0,"name":"claim_evidence","ran_at":"2026-05-19T20:01:57.074166Z","status":"completed","version":"1.0.0"},{"findings_count":1,"name":"doi_compliance","ran_at":"2026-05-19T18:01:24.690308Z","status":"completed","version":"1.0.0"},{"findings_count":0,"name":"doi_title_agreement","ran_at":"2026-05-19T18:01:18.624674Z","status":"completed","version":"1.0.0"},{"findings_count":0,"name":"ai_meta_artifact","ran_at":"2026-05-19T13:33:22.834146Z","status":"skipped","version":"1.0.0"}],"endpoint":"/pith/2605.15224/integrity.json","findings":[{"audited_at":"2026-05-19T18:01:24.690308Z","detected_arxiv_id":null,"detected_doi":"10.1162/TACL\\_A\\_00475","detector":"doi_compliance","finding_type":"recoverable_identifier","note":"DOI in the printed bibliography is fragmented by whitespace or line breaks. A longer candidate (10.1162/TACL\\_A\\_00475) was visible in the surrounding text but could not be confirmed against doi.org as printed.","ref_index":28,"severity":"advisory","verdict_class":"incontrovertible"}],"snapshot_sha256":"f3643a4cdb68bb505c3764342fd5bf2145110ad50029bfc1ce18115a0c39c457","summary":{"advisory":1,"by_detector":{"doi_compliance":{"advisory":1,"critical":0,"informational":0,"total":1}},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Large language model-based agents make mistakes, yet critique can often guide the same model toward correct behavior. However, when critique is removed, the model may fail again on the same query, indicating that it has not internalized the critique's guidance into its underlying capability. Meanwhile, a frozen critic cannot improve its feedback quality over time, limiting the potential for iterative self-improvement. To address this, we propose learning to internalize self-critique with reinforcement learning(ICRL), a novel framework that jointly trains a solver and a critic from a shared bac","authors_text":"Chengwei Qin, Heqing Zou, Hui Xiong, Jianbo Lin, Weishi Wang, Xiaomin Yu, Yifu Guo, Yi Xin, Zhongqi Yue, Zhuosong Jiang","cross_cats":["cs.MA"],"headline":"ICRL jointly trains a solver and critic from one backbone so critique gains become part of unassisted performance.","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.AI","submitted_at":"2026-05-13T08:50:05Z","title":"ICRL: Learning to Internalize Self-Critique with Reinforcement Learning"},"references":{"count":45,"internal_anchors":12,"resolved_work":45,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Advances in neural information processing systems , volume=","work_id":"782a3ae1-f760-4d7d-98b8-85da324ebad6","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Advances in neural information processing systems , volume=","work_id":"07ff1f09-5238-40b1-9239-0e4afc5e9787","year":null},{"cited_arxiv_id":"2305.11738","doi":"","is_internal_anchor":true,"ref_index":3,"title":"CRITIC: Large Language Models Can Self-Correct with Tool-Interactive Critiquing","work_id":"dcdfa0ce-7ed4-4614-acd9-8bdbf491fc26","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Self-RAG: Learning to Retrieve, Generate, and Critique through Self-Reflection , author=. ArXiv , year=","work_id":"3c43afea-eb94-4455-88d0-374efd5460c9","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"arXiv preprint arXiv:2303.16755 , year=","work_id":"18748c48-f59c-4357-8a13-e3854e429670","year":null}],"snapshot_sha256":"5ef9a412375816af24c7a31bad52b7ee7876c64b938b29fcbe870abf0252d219"},"source":{"id":"2605.15224","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-19T17:53:19.786250Z","id":"13d6bd54-1a86-4c7b-97ae-b4aa295777c2","model_set":{"reader":"grok-4.3"},"one_line_summary":"ICRL uses joint RL training of solver and critic with distribution-calibration re-weighting and role-wise advantage estimation to internalize critique into unassisted LLM performance, yielding 6.4-point gains on agentic tasks and 7.0 on math reasoning with Qwen3 models.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"ICRL jointly trains a solver and critic from one backbone so critique gains become part of unassisted performance.","strongest_claim":"ICRL jointly trains a solver and a critic from a shared backbone to convert critique-induced success into unassisted solver ability, with results showing average gains of 6.4 points over GRPO on agentic tasks and 7.0 points on mathematical reasoning using Qwen3-4B and Qwen3-8B backbones.","weakest_assumption":"The distribution-calibration re-weighting ratio successfully selects critique-guided improvements that remain compatible with the solver's own prompt distribution, without introducing bias or reducing performance on critique-free queries."}},"verdict_id":"13d6bd54-1a86-4c7b-97ae-b4aa295777c2"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:400b52d659978990e18be74e966884e783b0480b82bb75c622c0a427ea8a5888","target":"record","created_at":"2026-05-20T00:00:47Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"9fdc1d3d1bd380c0214ee6bf2cb1932af79419a7933be850497b565dc9bfeb89","cross_cats_sorted":["cs.MA"],"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.AI","submitted_at":"2026-05-13T08:50:05Z","title_canon_sha256":"3a24ce8aed65cb35d8fad1a68935797323fb6d2b6763ded61970ac03e6bdc42e"},"schema_version":"1.0","source":{"id":"2605.15224","kind":"arxiv","version":1}},"canonical_sha256":"d6d3a5988a73a1ab65cc852d5456aa722a1bbb89089139b366d27d177770154e","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"d6d3a5988a73a1ab65cc852d5456aa722a1bbb89089139b366d27d177770154e","first_computed_at":"2026-05-20T00:00:47.193945Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-20T00:00:47.193945Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"nWWTBevHMv5e9dpbZjExd1+XpcVOe4goXL24WRu09ZtoMqd2xbAqtK1KqENY1IuQQqTO4EAVXyCm8liyLNoMCA==","signature_status":"signed_v1","signed_at":"2026-05-20T00:00:47.194918Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.15224","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:ab2d50de2f4dba5863dbbf9d6ac957c85c8eb2493573615c6f8ddb09622d981f","sha256:400b52d659978990e18be74e966884e783b0480b82bb75c622c0a427ea8a5888","sha256:643c5e56b599ed0ee040c443a14566dc0c3e9336c367d7aa49f694f6f0cabcdd"],"state_sha256":"7097776aba138bcf56a65d5cb54899bd04b61fd9021aa197dc6480f4d86d699f"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"9aQAASBYLlyp53+eP38FB+1CXQzMQ5rPs4P8cfojapInvFWZ3MSHW7ZgjiPxtY/qsdTDDdxhqi+ZR+SV9j5bBg==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-24T02:28:44.707705Z","bundle_sha256":"c5e34107e52c4811202139936160f7c5ea4aa0a7a4198ec3259f878025316c6a"}}