{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2025:DAATHSCBPPTSQC6F3DVN7HDE2A","short_pith_number":"pith:DAATHSCB","canonical_record":{"source":{"id":"2506.14758","kind":"arxiv","version":4},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2025-06-17T17:54:03Z","cross_cats_sorted":[],"title_canon_sha256":"034df4168332dcc08d8cea9f107cc98a5b3c3e9ff5e87576f78fdb2d99b5faf0","abstract_canon_sha256":"5bec794faeb56b17b0c7c956ea9d005937306fb3a32dab0f0ba478a155f70bf3"},"schema_version":"1.0"},"canonical_sha256":"180133c8417be7280bc5d8eadf9c64d018a7830496441949e808ed3313acc502","source":{"kind":"arxiv","id":"2506.14758","version":4},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2506.14758","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"arxiv_version","alias_value":"2506.14758v4","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2506.14758","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"pith_short_12","alias_value":"DAATHSCBPPTS","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"DAATHSCBPPTSQC6F","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"DAATHSCB","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2025:DAATHSCBPPTSQC6F3DVN7HDE2A","target":"record","payload":{"canonical_record":{"source":{"id":"2506.14758","kind":"arxiv","version":4},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2025-06-17T17:54:03Z","cross_cats_sorted":[],"title_canon_sha256":"034df4168332dcc08d8cea9f107cc98a5b3c3e9ff5e87576f78fdb2d99b5faf0","abstract_canon_sha256":"5bec794faeb56b17b0c7c956ea9d005937306fb3a32dab0f0ba478a155f70bf3"},"schema_version":"1.0"},"canonical_sha256":"180133c8417be7280bc5d8eadf9c64d018a7830496441949e808ed3313acc502","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:48.850010Z","signature_b64":"AlVoGmJk0Rp2SqTaSaYQpVUYzM1JcEDqhMrBW2WhI3up9vuYolMeS5j8nV9aMUiT2CZsQFQdFTUNP9+bs9qBAA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"180133c8417be7280bc5d8eadf9c64d018a7830496441949e808ed3313acc502","last_reissued_at":"2026-05-17T23:38:48.849568Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:48.849568Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2506.14758","source_version":4,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:48Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"M8o/6edWw8Q2wiAO2T/EXGILmjTuUOO4MAwDvmq4+acNT+XwjEbS2Hz+8mqEkZKDjpmuVVV6V+wUTZReelw8BQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-22T21:01:24.144325Z"},"content_sha256":"4e1e3d2c1871839c0cfec9df94c7e5eee79a326ed068cf7f6625549c5eac37b5","schema_version":"1.0","event_id":"sha256:4e1e3d2c1871839c0cfec9df94c7e5eee79a326ed068cf7f6625549c5eac37b5"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2025:DAATHSCBPPTSQC6F3DVN7HDE2A","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Reasoning with Exploration: An Entropy Perspective","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"Augmenting the RL advantage function with an entropy term improves LLM reasoning on Pass@K by encouraging longer exploratory chains.","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Bo Dai, Daixuan Cheng, Furu Wei, Shaohan Huang, Wayne Xin Zhao, Xuekai Zhu, Zhenliang Zhang","submitted_at":"2025-06-17T17:54:03Z","abstract_excerpt":"Balancing exploration and exploitation is a central goal in reinforcement learning (RL). Despite recent advances in enhancing large language model (LLM) reasoning, most methods lean toward exploitation, and increasingly encounter performance plateaus. In this work, we revisit entropy -- a signal of exploration in RL -- and examine its relationship to exploratory reasoning in LLMs. Through empirical analysis, we uncover positive correlations between high-entropy regions and three types of exploratory reasoning actions: (1) pivotal tokens that determine or connect logical steps, (2) reflective a"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"our method achieves significant gains on the Pass@K metric -- an upper-bound estimator of LLM reasoning capabilities -- even when evaluated with extremely large K values, pushing the boundaries of LLM reasoning.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The observed positive correlations between high-entropy regions and beneficial exploratory actions (pivotal tokens, reflection, rare behaviors) will translate into improved downstream reasoning performance when the entropy term is added to the advantage function.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Augmenting the RL advantage with an entropy term promotes deeper LLM reasoning chains and raises Pass@K scores.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Augmenting the RL advantage function with an entropy term improves LLM reasoning on Pass@K by encouraging longer exploratory chains.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"a3afc3401fde5f9d05b268a7e41f9da27fefaeaff8b0a447c0741d38b7f98b00"},"source":{"id":"2506.14758","kind":"arxiv","version":4},"verdict":{"id":"18c9b05a-a8e2-40c4-b50c-66b778b5563c","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-16T06:16:18.830942Z","strongest_claim":"our method achieves significant gains on the Pass@K metric -- an upper-bound estimator of LLM reasoning capabilities -- even when evaluated with extremely large K values, pushing the boundaries of LLM reasoning.","one_line_summary":"Augmenting the RL advantage with an entropy term promotes deeper LLM reasoning chains and raises Pass@K scores.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The observed positive correlations between high-entropy regions and beneficial exploratory actions (pivotal tokens, reflection, rare behaviors) will translate into improved downstream reasoning performance when the entropy term is added to the advantage function.","pith_extraction_headline":"Augmenting the RL advantage function with an entropy term improves LLM reasoning on Pass@K by encouraging longer exploratory chains."},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":2,"snapshot_sha256":"7f64489350aa56258060107325552a54b65d97d7d4b28cdbdcd013d6d13f10be"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"18c9b05a-a8e2-40c4-b50c-66b778b5563c"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:48Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"RUl6UmWdugHg7mXI8xocG1nh+DLnk0pMPCB8ubJiPBgsd8FpMC2y4eE9S6UtVlde3SN2shvy2shlaXgYwlv+Ag==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-22T21:01:24.144840Z"},"content_sha256":"8d584e0ae40967a1d6f131db9753e5664a05accca6c2928159787b35a4fbf4b5","schema_version":"1.0","event_id":"sha256:8d584e0ae40967a1d6f131db9753e5664a05accca6c2928159787b35a4fbf4b5"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/DAATHSCBPPTSQC6F3DVN7HDE2A/bundle.json","state_url":"https://pith.science/pith/DAATHSCBPPTSQC6F3DVN7HDE2A/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/DAATHSCBPPTSQC6F3DVN7HDE2A/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-22T21:01:24Z","links":{"resolver":"https://pith.science/pith/DAATHSCBPPTSQC6F3DVN7HDE2A","bundle":"https://pith.science/pith/DAATHSCBPPTSQC6F3DVN7HDE2A/bundle.json","state":"https://pith.science/pith/DAATHSCBPPTSQC6F3DVN7HDE2A/state.json","well_known_bundle":"https://pith.science/.well-known/pith/DAATHSCBPPTSQC6F3DVN7HDE2A/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2025:DAATHSCBPPTSQC6F3DVN7HDE2A","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"5bec794faeb56b17b0c7c956ea9d005937306fb3a32dab0f0ba478a155f70bf3","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2025-06-17T17:54:03Z","title_canon_sha256":"034df4168332dcc08d8cea9f107cc98a5b3c3e9ff5e87576f78fdb2d99b5faf0"},"schema_version":"1.0","source":{"id":"2506.14758","kind":"arxiv","version":4}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2506.14758","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"arxiv_version","alias_value":"2506.14758v4","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2506.14758","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"pith_short_12","alias_value":"DAATHSCBPPTS","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"DAATHSCBPPTSQC6F","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"DAATHSCB","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:8d584e0ae40967a1d6f131db9753e5664a05accca6c2928159787b35a4fbf4b5","target":"graph","created_at":"2026-05-17T23:38:48Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"our method achieves significant gains on the Pass@K metric -- an upper-bound estimator of LLM reasoning capabilities -- even when evaluated with extremely large K values, pushing the boundaries of LLM reasoning."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"The observed positive correlations between high-entropy regions and beneficial exploratory actions (pivotal tokens, reflection, rare behaviors) will translate into improved downstream reasoning performance when the entropy term is added to the advantage function."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Augmenting the RL advantage with an entropy term promotes deeper LLM reasoning chains and raises Pass@K scores."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Augmenting the RL advantage function with an entropy term improves LLM reasoning on Pass@K by encouraging longer exploratory chains."}],"snapshot_sha256":"a3afc3401fde5f9d05b268a7e41f9da27fefaeaff8b0a447c0741d38b7f98b00"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"7f64489350aa56258060107325552a54b65d97d7d4b28cdbdcd013d6d13f10be"},"paper":{"abstract_excerpt":"Balancing exploration and exploitation is a central goal in reinforcement learning (RL). Despite recent advances in enhancing large language model (LLM) reasoning, most methods lean toward exploitation, and increasingly encounter performance plateaus. In this work, we revisit entropy -- a signal of exploration in RL -- and examine its relationship to exploratory reasoning in LLMs. Through empirical analysis, we uncover positive correlations between high-entropy regions and three types of exploratory reasoning actions: (1) pivotal tokens that determine or connect logical steps, (2) reflective a","authors_text":"Bo Dai, Daixuan Cheng, Furu Wei, Shaohan Huang, Wayne Xin Zhao, Xuekai Zhu, Zhenliang Zhang","cross_cats":[],"headline":"Augmenting the RL advantage function with an entropy term improves LLM reasoning on Pass@K by encouraging longer exploratory chains.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2025-06-17T17:54:03Z","title":"Reasoning with Exploration: An Entropy Perspective"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2506.14758","kind":"arxiv","version":4},"verdict":{"created_at":"2026-05-16T06:16:18.830942Z","id":"18c9b05a-a8e2-40c4-b50c-66b778b5563c","model_set":{"reader":"grok-4.3"},"one_line_summary":"Augmenting the RL advantage with an entropy term promotes deeper LLM reasoning chains and raises Pass@K scores.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Augmenting the RL advantage function with an entropy term improves LLM reasoning on Pass@K by encouraging longer exploratory chains.","strongest_claim":"our method achieves significant gains on the Pass@K metric -- an upper-bound estimator of LLM reasoning capabilities -- even when evaluated with extremely large K values, pushing the boundaries of LLM reasoning.","weakest_assumption":"The observed positive correlations between high-entropy regions and beneficial exploratory actions (pivotal tokens, reflection, rare behaviors) will translate into improved downstream reasoning performance when the entropy term is added to the advantage function."}},"verdict_id":"18c9b05a-a8e2-40c4-b50c-66b778b5563c"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:4e1e3d2c1871839c0cfec9df94c7e5eee79a326ed068cf7f6625549c5eac37b5","target":"record","created_at":"2026-05-17T23:38:48Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"5bec794faeb56b17b0c7c956ea9d005937306fb3a32dab0f0ba478a155f70bf3","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2025-06-17T17:54:03Z","title_canon_sha256":"034df4168332dcc08d8cea9f107cc98a5b3c3e9ff5e87576f78fdb2d99b5faf0"},"schema_version":"1.0","source":{"id":"2506.14758","kind":"arxiv","version":4}},"canonical_sha256":"180133c8417be7280bc5d8eadf9c64d018a7830496441949e808ed3313acc502","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"180133c8417be7280bc5d8eadf9c64d018a7830496441949e808ed3313acc502","first_computed_at":"2026-05-17T23:38:48.849568Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:48.849568Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"AlVoGmJk0Rp2SqTaSaYQpVUYzM1JcEDqhMrBW2WhI3up9vuYolMeS5j8nV9aMUiT2CZsQFQdFTUNP9+bs9qBAA==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:48.850010Z","signed_message":"canonical_sha256_bytes"},"source_id":"2506.14758","source_kind":"arxiv","source_version":4}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:4e1e3d2c1871839c0cfec9df94c7e5eee79a326ed068cf7f6625549c5eac37b5","sha256:8d584e0ae40967a1d6f131db9753e5664a05accca6c2928159787b35a4fbf4b5"],"state_sha256":"692bcc616247f1c0a0e78f4a464f57473732880beae2b7d6be480e0d7948bae2"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"53d3YDSDXrf2ijBGq7Y58cADcxCBe94eUgXz0PlNcWMs9g+EzhTq9ANcMyQxWUtkDjl2vMSviyD5D2cWfiuyBg==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-22T21:01:24.148034Z","bundle_sha256":"a512cd3733a62c2bfec51a3e540e1504c65c65a06bad466d5130a1f30cafda9e"}}