{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:JNRSKZFDN464UNAJYUJBLMC43F","short_pith_number":"pith:JNRSKZFD","canonical_record":{"source":{"id":"2603.19470","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-03-19T21:04:17Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"64b4efc2583088e55a949ca5f995dfda42d304439bbcd5b1fb46018925ba2b3a","abstract_canon_sha256":"6c69edb49c9a928a5773c5cc22ec3547f0d62b4ade877e63880c6c3555b5745a"},"schema_version":"1.0"},"canonical_sha256":"4b632564a36f3dca3409c51215b05cd967f49fd3f4f9ff48a7a86de0491adef1","source":{"kind":"arxiv","id":"2603.19470","version":3},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2603.19470","created_at":"2026-05-20T00:02:10Z"},{"alias_kind":"arxiv_version","alias_value":"2603.19470v3","created_at":"2026-05-20T00:02:10Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2603.19470","created_at":"2026-05-20T00:02:10Z"},{"alias_kind":"pith_short_12","alias_value":"JNRSKZFDN464","created_at":"2026-05-20T00:02:10Z"},{"alias_kind":"pith_short_16","alias_value":"JNRSKZFDN464UNAJ","created_at":"2026-05-20T00:02:10Z"},{"alias_kind":"pith_short_8","alias_value":"JNRSKZFD","created_at":"2026-05-20T00:02:10Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:JNRSKZFDN464UNAJYUJBLMC43F","target":"record","payload":{"canonical_record":{"source":{"id":"2603.19470","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-03-19T21:04:17Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"64b4efc2583088e55a949ca5f995dfda42d304439bbcd5b1fb46018925ba2b3a","abstract_canon_sha256":"6c69edb49c9a928a5773c5cc22ec3547f0d62b4ade877e63880c6c3555b5745a"},"schema_version":"1.0"},"canonical_sha256":"4b632564a36f3dca3409c51215b05cd967f49fd3f4f9ff48a7a86de0491adef1","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T00:02:10.768196Z","signature_b64":"1Xw3BMUeV6VK67lEUAaappshrAmAtb3a+fAiGUgKeQuNQNGKEtKBNGQXy551k1yvyhOSoR+5CXhISTsfPK9yAw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"4b632564a36f3dca3409c51215b05cd967f49fd3f4f9ff48a7a86de0491adef1","last_reissued_at":"2026-05-20T00:02:10.767538Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T00:02:10.767538Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2603.19470","source_version":3,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-20T00:02:10Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"4/kUxq5I75U8eE7R+OyrL/TAea1XoJhrOzcIjHpey/wqpfGh8cGSBnY/4Z12rl5WWGlQ1bO03mrAU8EeKST7AA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-02T22:39:11.034659Z"},"content_sha256":"b24986aedd4abc8e6be4b1822928baaf2495b8911d28d851bde3336f23b299e7","schema_version":"1.0","event_id":"sha256:b24986aedd4abc8e6be4b1822928baaf2495b8911d28d851bde3336f23b299e7"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:JNRSKZFDN464UNAJYUJBLMC43F","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Adaptive Layerwise Perturbation: Unifying Off-Policy Corrections for LLM RL","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Injecting learnable perturbations into hidden states of each layer stabilizes LLM reinforcement learning by flattening policy distributions and reducing importance ratio tails.","cross_cats":["cs.AI"],"primary_cat":"cs.LG","authors_text":"Abhinav Gullapalli, Chenlu Ye, Hao Chen, Jing Huang, Tong Zhang, Xuanchang Zhang, Yifan Hao, Zhou Yu, Ziji Zhang","submitted_at":"2026-03-19T21:04:17Z","abstract_excerpt":"Off-policy problems such as policy staleness and training--inference mismatch have become a major bottleneck for training stability and further exploration in LLM RL. The distribution gap between the inference and updated policies grows because of the techniques to enhance inference efficiency, leading to heavy-tailed importance ratios. Heavy-tailed ratios arise when the policy is locally sharp, which further inflates gradients and can push updates outside the trust region. To address this, we propose Adaptive Layerwise Perturbation (ALP), which injects small learnable perturbations into the i"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"ALP prevents the updated policy from deviating too sharply from the inference policy and enlarges the policy family to cover inference-time mismatch noise, thereby maintaining training stability and improving performance on math and tool tasks.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That small learnable perturbations added to intermediate hidden states will reliably flatten the policy distribution and reduce importance-ratio tails without introducing new instabilities or degrading the quality of the learned policy.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"ALP adds learnable perturbations to layer hidden states to flatten policy distributions and stabilize off-policy RL training for LLMs.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Injecting learnable perturbations into hidden states of each layer stabilizes LLM reinforcement learning by flattening policy distributions and reducing importance ratio tails.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"27cdfa7f5ddce88b48f39742513e8c19fa884a1d9a9b0dcd5bdf94c444d94b89"},"source":{"id":"2603.19470","kind":"arxiv","version":3},"verdict":{"id":"d84bd88f-c6b4-4462-8fca-7cce2fb799d8","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T08:01:49.847475Z","strongest_claim":"ALP prevents the updated policy from deviating too sharply from the inference policy and enlarges the policy family to cover inference-time mismatch noise, thereby maintaining training stability and improving performance on math and tool tasks.","one_line_summary":"ALP adds learnable perturbations to layer hidden states to flatten policy distributions and stabilize off-policy RL training for LLMs.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That small learnable perturbations added to intermediate hidden states will reliably flatten the policy distribution and reduce importance-ratio tails without introducing new instabilities or degrading the quality of the learned policy.","pith_extraction_headline":"Injecting learnable perturbations into hidden states of each layer stabilizes LLM reinforcement learning by flattening policy distributions and reducing importance ratio tails."},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2603.19470/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":2,"snapshot_sha256":"29f9106f61b463753a0770598828d4a011ef1b4936eb6fc44e071656ad96d2aa"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"d84bd88f-c6b4-4462-8fca-7cce2fb799d8"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-20T00:02:10Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"eiJ6/r0uuW5Qo3NL5WgvJ+zUTbWczQruQvO/2B/ve2nuUn4fe869aAjy0nsCEp0yeIT1ndfciKwYecIbE7PHAw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-02T22:39:11.035425Z"},"content_sha256":"9b59c4a81fa39819cbc8973cc1f56e63f4edfb8f80c3e73dbad87afa1975c301","schema_version":"1.0","event_id":"sha256:9b59c4a81fa39819cbc8973cc1f56e63f4edfb8f80c3e73dbad87afa1975c301"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/JNRSKZFDN464UNAJYUJBLMC43F/bundle.json","state_url":"https://pith.science/pith/JNRSKZFDN464UNAJYUJBLMC43F/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/JNRSKZFDN464UNAJYUJBLMC43F/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-02T22:39:11Z","links":{"resolver":"https://pith.science/pith/JNRSKZFDN464UNAJYUJBLMC43F","bundle":"https://pith.science/pith/JNRSKZFDN464UNAJYUJBLMC43F/bundle.json","state":"https://pith.science/pith/JNRSKZFDN464UNAJYUJBLMC43F/state.json","well_known_bundle":"https://pith.science/.well-known/pith/JNRSKZFDN464UNAJYUJBLMC43F/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:JNRSKZFDN464UNAJYUJBLMC43F","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"6c69edb49c9a928a5773c5cc22ec3547f0d62b4ade877e63880c6c3555b5745a","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-03-19T21:04:17Z","title_canon_sha256":"64b4efc2583088e55a949ca5f995dfda42d304439bbcd5b1fb46018925ba2b3a"},"schema_version":"1.0","source":{"id":"2603.19470","kind":"arxiv","version":3}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2603.19470","created_at":"2026-05-20T00:02:10Z"},{"alias_kind":"arxiv_version","alias_value":"2603.19470v3","created_at":"2026-05-20T00:02:10Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2603.19470","created_at":"2026-05-20T00:02:10Z"},{"alias_kind":"pith_short_12","alias_value":"JNRSKZFDN464","created_at":"2026-05-20T00:02:10Z"},{"alias_kind":"pith_short_16","alias_value":"JNRSKZFDN464UNAJ","created_at":"2026-05-20T00:02:10Z"},{"alias_kind":"pith_short_8","alias_value":"JNRSKZFD","created_at":"2026-05-20T00:02:10Z"}],"graph_snapshots":[{"event_id":"sha256:9b59c4a81fa39819cbc8973cc1f56e63f4edfb8f80c3e73dbad87afa1975c301","target":"graph","created_at":"2026-05-20T00:02:10Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"ALP prevents the updated policy from deviating too sharply from the inference policy and enlarges the policy family to cover inference-time mismatch noise, thereby maintaining training stability and improving performance on math and tool tasks."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That small learnable perturbations added to intermediate hidden states will reliably flatten the policy distribution and reduce importance-ratio tails without introducing new instabilities or degrading the quality of the learned policy."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"ALP adds learnable perturbations to layer hidden states to flatten policy distributions and stabilize off-policy RL training for LLMs."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Injecting learnable perturbations into hidden states of each layer stabilizes LLM reinforcement learning by flattening policy distributions and reducing importance ratio tails."}],"snapshot_sha256":"27cdfa7f5ddce88b48f39742513e8c19fa884a1d9a9b0dcd5bdf94c444d94b89"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"29f9106f61b463753a0770598828d4a011ef1b4936eb6fc44e071656ad96d2aa"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2603.19470/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Off-policy problems such as policy staleness and training--inference mismatch have become a major bottleneck for training stability and further exploration in LLM RL. The distribution gap between the inference and updated policies grows because of the techniques to enhance inference efficiency, leading to heavy-tailed importance ratios. Heavy-tailed ratios arise when the policy is locally sharp, which further inflates gradients and can push updates outside the trust region. To address this, we propose Adaptive Layerwise Perturbation (ALP), which injects small learnable perturbations into the i","authors_text":"Abhinav Gullapalli, Chenlu Ye, Hao Chen, Jing Huang, Tong Zhang, Xuanchang Zhang, Yifan Hao, Zhou Yu, Ziji Zhang","cross_cats":["cs.AI"],"headline":"Injecting learnable perturbations into hidden states of each layer stabilizes LLM reinforcement learning by flattening policy distributions and reducing importance ratio tails.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-03-19T21:04:17Z","title":"Adaptive Layerwise Perturbation: Unifying Off-Policy Corrections for LLM RL"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2603.19470","kind":"arxiv","version":3},"verdict":{"created_at":"2026-05-15T08:01:49.847475Z","id":"d84bd88f-c6b4-4462-8fca-7cce2fb799d8","model_set":{"reader":"grok-4.3"},"one_line_summary":"ALP adds learnable perturbations to layer hidden states to flatten policy distributions and stabilize off-policy RL training for LLMs.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Injecting learnable perturbations into hidden states of each layer stabilizes LLM reinforcement learning by flattening policy distributions and reducing importance ratio tails.","strongest_claim":"ALP prevents the updated policy from deviating too sharply from the inference policy and enlarges the policy family to cover inference-time mismatch noise, thereby maintaining training stability and improving performance on math and tool tasks.","weakest_assumption":"That small learnable perturbations added to intermediate hidden states will reliably flatten the policy distribution and reduce importance-ratio tails without introducing new instabilities or degrading the quality of the learned policy."}},"verdict_id":"d84bd88f-c6b4-4462-8fca-7cce2fb799d8"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:b24986aedd4abc8e6be4b1822928baaf2495b8911d28d851bde3336f23b299e7","target":"record","created_at":"2026-05-20T00:02:10Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"6c69edb49c9a928a5773c5cc22ec3547f0d62b4ade877e63880c6c3555b5745a","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-03-19T21:04:17Z","title_canon_sha256":"64b4efc2583088e55a949ca5f995dfda42d304439bbcd5b1fb46018925ba2b3a"},"schema_version":"1.0","source":{"id":"2603.19470","kind":"arxiv","version":3}},"canonical_sha256":"4b632564a36f3dca3409c51215b05cd967f49fd3f4f9ff48a7a86de0491adef1","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"4b632564a36f3dca3409c51215b05cd967f49fd3f4f9ff48a7a86de0491adef1","first_computed_at":"2026-05-20T00:02:10.767538Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-20T00:02:10.767538Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"1Xw3BMUeV6VK67lEUAaappshrAmAtb3a+fAiGUgKeQuNQNGKEtKBNGQXy551k1yvyhOSoR+5CXhISTsfPK9yAw==","signature_status":"signed_v1","signed_at":"2026-05-20T00:02:10.768196Z","signed_message":"canonical_sha256_bytes"},"source_id":"2603.19470","source_kind":"arxiv","source_version":3}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:b24986aedd4abc8e6be4b1822928baaf2495b8911d28d851bde3336f23b299e7","sha256:9b59c4a81fa39819cbc8973cc1f56e63f4edfb8f80c3e73dbad87afa1975c301"],"state_sha256":"90c4eade228a619d68d1c9bf547fe48cc63bb8cda3fa806f27e1de9e0a4bcf7a"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"5yrisjWXGMh99IMq1aZlqvSvd1j00TAEo3ubwdFurT1hZm7kkSV/YIPoaIk/+EvV73LxQBa2P8QL1Ss0WWCLAQ==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-02T22:39:11.038571Z","bundle_sha256":"a5b02a60ec6c8cc6d48ae717f91707cdb9e43d03df508d07b018906ccb9a4485"}}