{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:SNFFGAUG5VHQNIMTPVUMB73G54","short_pith_number":"pith:SNFFGAUG","schema_version":"1.0","canonical_sha256":"934a530286ed4f06a1937d68c0ff66ef19982798bb64d29d8b351de5d7e1ef74","source":{"kind":"arxiv","id":"2605.20256","version":1},"attestation_state":"computed","paper":{"title":"FBOS-RL: Feedback-Driven Bi-Objective Synergistic Reinforcement Learning","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.LG","authors_text":"Likang Xiao, Liu Liu, Peng Jiang, Quan Chen, Wenjun Wu, Xikai Zhang, Yanhua Cheng, Yingze Zhang, Yongzhi Li","submitted_at":"2026-05-18T12:48:36Z","abstract_excerpt":"Reinforcement learning has become a cornerstone for aligning and unlocking the reasoning capabilities of large-scale models. At its core, the training loop of GRPO and its variants alternates between rollout sampling and policy update. Unlike supervised learning, where each gradient step is anchored to an explicit ground-truth target, the optimal gradient direction for updating model parameters in this setting is not known a priori; the high-quality rollouts drawn during the sampling stage therefore act as the implicit \"teacher\" that guides every parameter update. However, GRPO adopt a simple "},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2605.20256","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2026-05-18T12:48:36Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"49d68767a893b79144f7e0e4f63473c98d96f85600129898db1d675af3834b9b","abstract_canon_sha256":"035fb1f5c35fce336ae12c5e37cc331cb4822a36bb41b0f90dbce765fee97c3e"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-21T00:04:22.924027Z","signature_b64":"RP7PZ7sI4ztrhUkCF9Q35f8EFGcwM/eQh3sgBxUi+D2E8ccTovlVH+nXI/hA1QMgNa02+MXbUn4tPwJz3L3DBA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"934a530286ed4f06a1937d68c0ff66ef19982798bb64d29d8b351de5d7e1ef74","last_reissued_at":"2026-05-21T00:04:22.923542Z","signature_status":"signed_v1","first_computed_at":"2026-05-21T00:04:22.923542Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"FBOS-RL: Feedback-Driven Bi-Objective Synergistic Reinforcement Learning","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.LG","authors_text":"Likang Xiao, Liu Liu, Peng Jiang, Quan Chen, Wenjun Wu, Xikai Zhang, Yanhua Cheng, Yingze Zhang, Yongzhi Li","submitted_at":"2026-05-18T12:48:36Z","abstract_excerpt":"Reinforcement learning has become a cornerstone for aligning and unlocking the reasoning capabilities of large-scale models. At its core, the training loop of GRPO and its variants alternates between rollout sampling and policy update. Unlike supervised learning, where each gradient step is anchored to an explicit ground-truth target, the optimal gradient direction for updating model parameters in this setting is not known a priori; the high-quality rollouts drawn during the sampling stage therefore act as the implicit \"teacher\" that guides every parameter update. However, GRPO adopt a simple "},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.20256","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.20256/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2605.20256","created_at":"2026-05-21T00:04:22.923614+00:00"},{"alias_kind":"arxiv_version","alias_value":"2605.20256v1","created_at":"2026-05-21T00:04:22.923614+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.20256","created_at":"2026-05-21T00:04:22.923614+00:00"},{"alias_kind":"pith_short_12","alias_value":"SNFFGAUG5VHQ","created_at":"2026-05-21T00:04:22.923614+00:00"},{"alias_kind":"pith_short_16","alias_value":"SNFFGAUG5VHQNIMT","created_at":"2026-05-21T00:04:22.923614+00:00"},{"alias_kind":"pith_short_8","alias_value":"SNFFGAUG","created_at":"2026-05-21T00:04:22.923614+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/SNFFGAUG5VHQNIMTPVUMB73G54","json":"https://pith.science/pith/SNFFGAUG5VHQNIMTPVUMB73G54.json","graph_json":"https://pith.science/api/pith-number/SNFFGAUG5VHQNIMTPVUMB73G54/graph.json","events_json":"https://pith.science/api/pith-number/SNFFGAUG5VHQNIMTPVUMB73G54/events.json","paper":"https://pith.science/paper/SNFFGAUG"},"agent_actions":{"view_html":"https://pith.science/pith/SNFFGAUG5VHQNIMTPVUMB73G54","download_json":"https://pith.science/pith/SNFFGAUG5VHQNIMTPVUMB73G54.json","view_paper":"https://pith.science/paper/SNFFGAUG","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2605.20256&json=true","fetch_graph":"https://pith.science/api/pith-number/SNFFGAUG5VHQNIMTPVUMB73G54/graph.json","fetch_events":"https://pith.science/api/pith-number/SNFFGAUG5VHQNIMTPVUMB73G54/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/SNFFGAUG5VHQNIMTPVUMB73G54/action/timestamp_anchor","attest_storage":"https://pith.science/pith/SNFFGAUG5VHQNIMTPVUMB73G54/action/storage_attestation","attest_author":"https://pith.science/pith/SNFFGAUG5VHQNIMTPVUMB73G54/action/author_attestation","sign_citation":"https://pith.science/pith/SNFFGAUG5VHQNIMTPVUMB73G54/action/citation_signature","submit_replication":"https://pith.science/pith/SNFFGAUG5VHQNIMTPVUMB73G54/action/replication_record"}},"created_at":"2026-05-21T00:04:22.923614+00:00","updated_at":"2026-05-21T00:04:22.923614+00:00"}