{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:OQOO75CB2EXUFQZBNSUDGQQNXB","short_pith_number":"pith:OQOO75CB","canonical_record":{"source":{"id":"2601.21484","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-01-29T10:06:52Z","cross_cats_sorted":[],"title_canon_sha256":"4d007177e9e248bbe1695479fc361a14ae94f9d4e6133f3af6b2bf9dac369cff","abstract_canon_sha256":"ec86568629ee249a0e9eb8b9c23ed7b5ad0b7ac0449eed7b9729614d306082f9"},"schema_version":"1.0"},"canonical_sha256":"741ceff441d12f42c3216ca833420db8509f0e7658d8c4506b84587f5585c154","source":{"kind":"arxiv","id":"2601.21484","version":3},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2601.21484","created_at":"2026-05-20T01:05:07Z"},{"alias_kind":"arxiv_version","alias_value":"2601.21484v3","created_at":"2026-05-20T01:05:07Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2601.21484","created_at":"2026-05-20T01:05:07Z"},{"alias_kind":"pith_short_12","alias_value":"OQOO75CB2EXU","created_at":"2026-05-20T01:05:07Z"},{"alias_kind":"pith_short_16","alias_value":"OQOO75CB2EXUFQZB","created_at":"2026-05-20T01:05:07Z"},{"alias_kind":"pith_short_8","alias_value":"OQOO75CB","created_at":"2026-05-20T01:05:07Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:OQOO75CB2EXUFQZBNSUDGQQNXB","target":"record","payload":{"canonical_record":{"source":{"id":"2601.21484","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-01-29T10:06:52Z","cross_cats_sorted":[],"title_canon_sha256":"4d007177e9e248bbe1695479fc361a14ae94f9d4e6133f3af6b2bf9dac369cff","abstract_canon_sha256":"ec86568629ee249a0e9eb8b9c23ed7b5ad0b7ac0449eed7b9729614d306082f9"},"schema_version":"1.0"},"canonical_sha256":"741ceff441d12f42c3216ca833420db8509f0e7658d8c4506b84587f5585c154","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T01:05:07.226822Z","signature_b64":"WU59kPoWryPNsEIMHqhAoVQFhVOOYX5QNNL5GFpKChNavTfJsqgmSiEm5RxH1tZg+ke0/nPJke4jIV9mu5KTDw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"741ceff441d12f42c3216ca833420db8509f0e7658d8c4506b84587f5585c154","last_reissued_at":"2026-05-20T01:05:07.225942Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T01:05:07.225942Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2601.21484","source_version":3,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-20T01:05:07Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"0rgezadYW7E37VHqd4miGQlqHcsDKj4ZHrfXdrZOAKyVO9eDiYLc95OPr6+lzQ6IsXMe+TMqh6yge7nVVp/1AA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-25T21:53:53.374104Z"},"content_sha256":"921b9d6877349115d3785b3573113b47dcd44b1c1b0e862ae0e1f234776cb69c","schema_version":"1.0","event_id":"sha256:921b9d6877349115d3785b3573113b47dcd44b1c1b0e862ae0e1f234776cb69c"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:OQOO75CB2EXUFQZBNSUDGQQNXB","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"ETS: Energy-Guided Test-Time Scaling for Training-Free RL Alignment","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Energy-guided test-time scaling samples directly from the optimal RL policy without any training.","cross_cats":[],"primary_cat":"cs.LG","authors_text":"Jinkai Zhang, Ju Fan, Longqiang Wang, Mingyang Yi, Xiuyu Li, Yue Wang, Yu Li","submitted_at":"2026-01-29T10:06:52Z","abstract_excerpt":"Reinforcement Learning (RL) post-training alignment for language models is effective, but also costly and unstable in practice, owing to its complicated training process. To address this, we propose a training-free inference method to sample directly from the optimal RL policy. The transition probability applied to Masked Language Modeling (MLM) consists of a reference policy model and an energy term. Based on this, our algorithm, Energy-Guided Test-Time Scaling (ETS), estimates the key energy term via online Monte Carlo, with a provable convergence rate. Moreover, to ensure practical efficien"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Our algorithm, Energy-Guided Test-Time Scaling (ETS), estimates the key energy term via online Monte Carlo, with a provable convergence rate. Moreover, to ensure practical efficiency, ETS leverages modern acceleration frameworks alongside tailored importance sampling estimators, substantially reducing inference latency while provably preserving sampling quality.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The energy term derived from the reference policy and optimal RL policy can be estimated accurately enough via online Monte Carlo to approximate the target distribution without introducing substantial bias or requiring post-hoc adjustments that affect the claimed convergence.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"ETS enables direct sampling from the optimal RL policy for language models at inference time by estimating the energy term with online Monte Carlo and acceleration techniques.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Energy-guided test-time scaling samples directly from the optimal RL policy without any training.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"bac0fd335102a5519cd7fbad9136d8ba409469bd9afe28a1d18e052428894a81"},"source":{"id":"2601.21484","kind":"arxiv","version":3},"verdict":{"id":"da700276-d84f-4a70-8e50-7b462aafcb21","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-16T09:35:06.013193Z","strongest_claim":"Our algorithm, Energy-Guided Test-Time Scaling (ETS), estimates the key energy term via online Monte Carlo, with a provable convergence rate. Moreover, to ensure practical efficiency, ETS leverages modern acceleration frameworks alongside tailored importance sampling estimators, substantially reducing inference latency while provably preserving sampling quality.","one_line_summary":"ETS enables direct sampling from the optimal RL policy for language models at inference time by estimating the energy term with online Monte Carlo and acceleration techniques.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The energy term derived from the reference policy and optimal RL policy can be estimated accurately enough via online Monte Carlo to approximate the target distribution without introducing substantial bias or requiring post-hoc adjustments that affect the claimed convergence.","pith_extraction_headline":"Energy-guided test-time scaling samples directly from the optimal RL policy without any training."},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2601.21484/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":2,"snapshot_sha256":"db74b652bd7debe85b3b8283c53984fded143c8dd731214b5f494aff7cf9a15b"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"da700276-d84f-4a70-8e50-7b462aafcb21"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-20T01:05:07Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"tjCt0US0s/uQvhe3ICZcURme96UZMCJ9Ppx363crC1iQqmgdyoGxBDeT3huldC9yPlJANbQD0J12NajOgzarAQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-25T21:53:53.374990Z"},"content_sha256":"2ba8aba4bccf709014934c8eda407038fb83377043cadb2030a820ebe8ee48f4","schema_version":"1.0","event_id":"sha256:2ba8aba4bccf709014934c8eda407038fb83377043cadb2030a820ebe8ee48f4"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/OQOO75CB2EXUFQZBNSUDGQQNXB/bundle.json","state_url":"https://pith.science/pith/OQOO75CB2EXUFQZBNSUDGQQNXB/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/OQOO75CB2EXUFQZBNSUDGQQNXB/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-25T21:53:53Z","links":{"resolver":"https://pith.science/pith/OQOO75CB2EXUFQZBNSUDGQQNXB","bundle":"https://pith.science/pith/OQOO75CB2EXUFQZBNSUDGQQNXB/bundle.json","state":"https://pith.science/pith/OQOO75CB2EXUFQZBNSUDGQQNXB/state.json","well_known_bundle":"https://pith.science/.well-known/pith/OQOO75CB2EXUFQZBNSUDGQQNXB/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:OQOO75CB2EXUFQZBNSUDGQQNXB","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"ec86568629ee249a0e9eb8b9c23ed7b5ad0b7ac0449eed7b9729614d306082f9","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-01-29T10:06:52Z","title_canon_sha256":"4d007177e9e248bbe1695479fc361a14ae94f9d4e6133f3af6b2bf9dac369cff"},"schema_version":"1.0","source":{"id":"2601.21484","kind":"arxiv","version":3}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2601.21484","created_at":"2026-05-20T01:05:07Z"},{"alias_kind":"arxiv_version","alias_value":"2601.21484v3","created_at":"2026-05-20T01:05:07Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2601.21484","created_at":"2026-05-20T01:05:07Z"},{"alias_kind":"pith_short_12","alias_value":"OQOO75CB2EXU","created_at":"2026-05-20T01:05:07Z"},{"alias_kind":"pith_short_16","alias_value":"OQOO75CB2EXUFQZB","created_at":"2026-05-20T01:05:07Z"},{"alias_kind":"pith_short_8","alias_value":"OQOO75CB","created_at":"2026-05-20T01:05:07Z"}],"graph_snapshots":[{"event_id":"sha256:2ba8aba4bccf709014934c8eda407038fb83377043cadb2030a820ebe8ee48f4","target":"graph","created_at":"2026-05-20T01:05:07Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Our algorithm, Energy-Guided Test-Time Scaling (ETS), estimates the key energy term via online Monte Carlo, with a provable convergence rate. Moreover, to ensure practical efficiency, ETS leverages modern acceleration frameworks alongside tailored importance sampling estimators, substantially reducing inference latency while provably preserving sampling quality."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"The energy term derived from the reference policy and optimal RL policy can be estimated accurately enough via online Monte Carlo to approximate the target distribution without introducing substantial bias or requiring post-hoc adjustments that affect the claimed convergence."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"ETS enables direct sampling from the optimal RL policy for language models at inference time by estimating the energy term with online Monte Carlo and acceleration techniques."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Energy-guided test-time scaling samples directly from the optimal RL policy without any training."}],"snapshot_sha256":"bac0fd335102a5519cd7fbad9136d8ba409469bd9afe28a1d18e052428894a81"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"db74b652bd7debe85b3b8283c53984fded143c8dd731214b5f494aff7cf9a15b"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2601.21484/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Reinforcement Learning (RL) post-training alignment for language models is effective, but also costly and unstable in practice, owing to its complicated training process. To address this, we propose a training-free inference method to sample directly from the optimal RL policy. The transition probability applied to Masked Language Modeling (MLM) consists of a reference policy model and an energy term. Based on this, our algorithm, Energy-Guided Test-Time Scaling (ETS), estimates the key energy term via online Monte Carlo, with a provable convergence rate. Moreover, to ensure practical efficien","authors_text":"Jinkai Zhang, Ju Fan, Longqiang Wang, Mingyang Yi, Xiuyu Li, Yue Wang, Yu Li","cross_cats":[],"headline":"Energy-guided test-time scaling samples directly from the optimal RL policy without any training.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-01-29T10:06:52Z","title":"ETS: Energy-Guided Test-Time Scaling for Training-Free RL Alignment"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2601.21484","kind":"arxiv","version":3},"verdict":{"created_at":"2026-05-16T09:35:06.013193Z","id":"da700276-d84f-4a70-8e50-7b462aafcb21","model_set":{"reader":"grok-4.3"},"one_line_summary":"ETS enables direct sampling from the optimal RL policy for language models at inference time by estimating the energy term with online Monte Carlo and acceleration techniques.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Energy-guided test-time scaling samples directly from the optimal RL policy without any training.","strongest_claim":"Our algorithm, Energy-Guided Test-Time Scaling (ETS), estimates the key energy term via online Monte Carlo, with a provable convergence rate. Moreover, to ensure practical efficiency, ETS leverages modern acceleration frameworks alongside tailored importance sampling estimators, substantially reducing inference latency while provably preserving sampling quality.","weakest_assumption":"The energy term derived from the reference policy and optimal RL policy can be estimated accurately enough via online Monte Carlo to approximate the target distribution without introducing substantial bias or requiring post-hoc adjustments that affect the claimed convergence."}},"verdict_id":"da700276-d84f-4a70-8e50-7b462aafcb21"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:921b9d6877349115d3785b3573113b47dcd44b1c1b0e862ae0e1f234776cb69c","target":"record","created_at":"2026-05-20T01:05:07Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"ec86568629ee249a0e9eb8b9c23ed7b5ad0b7ac0449eed7b9729614d306082f9","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-01-29T10:06:52Z","title_canon_sha256":"4d007177e9e248bbe1695479fc361a14ae94f9d4e6133f3af6b2bf9dac369cff"},"schema_version":"1.0","source":{"id":"2601.21484","kind":"arxiv","version":3}},"canonical_sha256":"741ceff441d12f42c3216ca833420db8509f0e7658d8c4506b84587f5585c154","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"741ceff441d12f42c3216ca833420db8509f0e7658d8c4506b84587f5585c154","first_computed_at":"2026-05-20T01:05:07.225942Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-20T01:05:07.225942Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"WU59kPoWryPNsEIMHqhAoVQFhVOOYX5QNNL5GFpKChNavTfJsqgmSiEm5RxH1tZg+ke0/nPJke4jIV9mu5KTDw==","signature_status":"signed_v1","signed_at":"2026-05-20T01:05:07.226822Z","signed_message":"canonical_sha256_bytes"},"source_id":"2601.21484","source_kind":"arxiv","source_version":3}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:921b9d6877349115d3785b3573113b47dcd44b1c1b0e862ae0e1f234776cb69c","sha256:2ba8aba4bccf709014934c8eda407038fb83377043cadb2030a820ebe8ee48f4"],"state_sha256":"dc531a792a12f8efad28dd81296f9a4467d0e58d40e260a7332a0f8839abf50c"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"UUzlw8SJbhLgzl4IIGoda+8eUeIMUpY2gLbxNktrvedTUiM5lyZMKrdXkTcvK5DrFxiGT7Gukvw0Sf/m+XmRAg==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-25T21:53:53.378766Z","bundle_sha256":"e251aa62de35ebbdf47ff7bd39fda35985f59227cb2697b470f321ce8ab197c3"}}