{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:PWK3NMHMKKFQRA4TWJJIPVKOJA","short_pith_number":"pith:PWK3NMHM","schema_version":"1.0","canonical_sha256":"7d95b6b0ec528b088393b25287d54e48330846a512da1310d6cea4c60316aabb","source":{"kind":"arxiv","id":"2606.03021","version":1},"attestation_state":"computed","paper":{"title":"Hint-Guided Diversified Policy Optimization for LLM Reasoning","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Can Ye, Kaixin Wu, Mingjie Zhong, Peifeng Li, Qiaoming Zhu, Xiaobo Li, Zhiyu Cao","submitted_at":"2026-06-02T01:55:54Z","abstract_excerpt":"Recent developments in Large Language Models (LLMs) have showcased impressive reasoning capabilities, with Reinforcement Learning with Verifiable Rewards (RLVR) being a promising enhancement strategy. However, existing reward mechanisms are constrained to the outcome-level correctness and lack explicit signals to guide the model to consider diverse solutions. In contrast, human problem solving typically involves evaluating multiple potential approaches and selecting the most reliable solution, a cognitive process that current RLVR frameworks do not explicitly incentivize. Inspired by this, we "},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.03021","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-06-02T01:55:54Z","cross_cats_sorted":[],"title_canon_sha256":"e7ae8e42101178225bf375156f86c5b3011a133635405a127d3cfbb9ecd3ff53","abstract_canon_sha256":"7c11bf83a396d8841ff5c3fbcd213c11c4fc4cef432a2dfe8da6856e1cbb74f4"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-03T01:05:29.464783Z","signature_b64":"NlpkXXi25Ab+L98Qn17Mr690GAsCOQFVyy5iDk/hZNR+8DRGdTkbfTFR6O/E5/F2xg75699J5MxARyLNK2t9CA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"7d95b6b0ec528b088393b25287d54e48330846a512da1310d6cea4c60316aabb","last_reissued_at":"2026-06-03T01:05:29.464344Z","signature_status":"signed_v1","first_computed_at":"2026-06-03T01:05:29.464344Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Hint-Guided Diversified Policy Optimization for LLM Reasoning","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Can Ye, Kaixin Wu, Mingjie Zhong, Peifeng Li, Qiaoming Zhu, Xiaobo Li, Zhiyu Cao","submitted_at":"2026-06-02T01:55:54Z","abstract_excerpt":"Recent developments in Large Language Models (LLMs) have showcased impressive reasoning capabilities, with Reinforcement Learning with Verifiable Rewards (RLVR) being a promising enhancement strategy. However, existing reward mechanisms are constrained to the outcome-level correctness and lack explicit signals to guide the model to consider diverse solutions. In contrast, human problem solving typically involves evaluating multiple potential approaches and selecting the most reliable solution, a cognitive process that current RLVR frameworks do not explicitly incentivize. Inspired by this, we "},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.03021","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.03021/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.03021","created_at":"2026-06-03T01:05:29.464407+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.03021v1","created_at":"2026-06-03T01:05:29.464407+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.03021","created_at":"2026-06-03T01:05:29.464407+00:00"},{"alias_kind":"pith_short_12","alias_value":"PWK3NMHMKKFQ","created_at":"2026-06-03T01:05:29.464407+00:00"},{"alias_kind":"pith_short_16","alias_value":"PWK3NMHMKKFQRA4T","created_at":"2026-06-03T01:05:29.464407+00:00"},{"alias_kind":"pith_short_8","alias_value":"PWK3NMHM","created_at":"2026-06-03T01:05:29.464407+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/PWK3NMHMKKFQRA4TWJJIPVKOJA","json":"https://pith.science/pith/PWK3NMHMKKFQRA4TWJJIPVKOJA.json","graph_json":"https://pith.science/api/pith-number/PWK3NMHMKKFQRA4TWJJIPVKOJA/graph.json","events_json":"https://pith.science/api/pith-number/PWK3NMHMKKFQRA4TWJJIPVKOJA/events.json","paper":"https://pith.science/paper/PWK3NMHM"},"agent_actions":{"view_html":"https://pith.science/pith/PWK3NMHMKKFQRA4TWJJIPVKOJA","download_json":"https://pith.science/pith/PWK3NMHMKKFQRA4TWJJIPVKOJA.json","view_paper":"https://pith.science/paper/PWK3NMHM","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.03021&json=true","fetch_graph":"https://pith.science/api/pith-number/PWK3NMHMKKFQRA4TWJJIPVKOJA/graph.json","fetch_events":"https://pith.science/api/pith-number/PWK3NMHMKKFQRA4TWJJIPVKOJA/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/PWK3NMHMKKFQRA4TWJJIPVKOJA/action/timestamp_anchor","attest_storage":"https://pith.science/pith/PWK3NMHMKKFQRA4TWJJIPVKOJA/action/storage_attestation","attest_author":"https://pith.science/pith/PWK3NMHMKKFQRA4TWJJIPVKOJA/action/author_attestation","sign_citation":"https://pith.science/pith/PWK3NMHMKKFQRA4TWJJIPVKOJA/action/citation_signature","submit_replication":"https://pith.science/pith/PWK3NMHMKKFQRA4TWJJIPVKOJA/action/replication_record"}},"created_at":"2026-06-03T01:05:29.464407+00:00","updated_at":"2026-06-03T01:05:29.464407+00:00"}