{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:PHKD6RXN32BU63J7BVY36ELX5Z","short_pith_number":"pith:PHKD6RXN","canonical_record":{"source":{"id":"2604.25719","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"eess.AS","submitted_at":"2026-04-28T14:44:30Z","cross_cats_sorted":[],"title_canon_sha256":"e0bdc6f884074d20753f8b342dd28f73a92f301d63a8c1898816f5cd9be6c90a","abstract_canon_sha256":"41843e555f50286fe27243fc731d333ae7cdf2d0f24384c72a773ce7802391c9"},"schema_version":"1.0"},"canonical_sha256":"79d43f46edde834f6d3f0d71bf1177ee54aa2a59807d92332b0dddf6f87c744d","source":{"kind":"arxiv","id":"2604.25719","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2604.25719","created_at":"2026-06-02T02:04:18Z"},{"alias_kind":"arxiv_version","alias_value":"2604.25719v2","created_at":"2026-06-02T02:04:18Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2604.25719","created_at":"2026-06-02T02:04:18Z"},{"alias_kind":"pith_short_12","alias_value":"PHKD6RXN32BU","created_at":"2026-06-02T02:04:18Z"},{"alias_kind":"pith_short_16","alias_value":"PHKD6RXN32BU63J7","created_at":"2026-06-02T02:04:18Z"},{"alias_kind":"pith_short_8","alias_value":"PHKD6RXN","created_at":"2026-06-02T02:04:18Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:PHKD6RXN32BU63J7BVY36ELX5Z","target":"record","payload":{"canonical_record":{"source":{"id":"2604.25719","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"eess.AS","submitted_at":"2026-04-28T14:44:30Z","cross_cats_sorted":[],"title_canon_sha256":"e0bdc6f884074d20753f8b342dd28f73a92f301d63a8c1898816f5cd9be6c90a","abstract_canon_sha256":"41843e555f50286fe27243fc731d333ae7cdf2d0f24384c72a773ce7802391c9"},"schema_version":"1.0"},"canonical_sha256":"79d43f46edde834f6d3f0d71bf1177ee54aa2a59807d92332b0dddf6f87c744d","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-02T02:04:18.241780Z","signature_b64":"M8nmK5RVwGUt5j4ID57BIta4+/8drAZasQS2rdQb7GIAoe4FRIrYSHXLltiyD23eashcZjEZo2aG8T+wLCuvAA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"79d43f46edde834f6d3f0d71bf1177ee54aa2a59807d92332b0dddf6f87c744d","last_reissued_at":"2026-06-02T02:04:18.241351Z","signature_status":"signed_v1","first_computed_at":"2026-06-02T02:04:18.241351Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2604.25719","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-06-02T02:04:18Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"Cl5lnOM/I4kukIjBqFSpSlxPfZW8LdukLa/CTYV7M1jc4rqVPOwhv2gRxtVPg0dh5PB0iFuKhG+4H6KB3nihCQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-05T11:58:51.734562Z"},"content_sha256":"78cb852ce063e663352f834d957e3e111adad4bae20ea400d16e76eafa74161b","schema_version":"1.0","event_id":"sha256:78cb852ce063e663352f834d957e3e111adad4bae20ea400d16e76eafa74161b"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:PHKD6RXN32BU63J7BVY36ELX5Z","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Step-Audio-R1.5 Technical Report","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Reinforcement learning from human feedback keeps audio reasoning strong while restoring natural spoken dialogue qualities lost under verified rewards.","cross_cats":[],"primary_cat":"eess.AS","authors_text":"Chengyuan Yao, Daijiao Liu, Daxin Jiang, Eng Siong Chng, Fei Tian, Gang Yu, Haoyang Zhang, Hexin Liu, Jinglan Gong, Jun Chen, Liang Zhao, Qingjian Lin, Xiangyu Tony Zhang, Xiangyu Zhang, Xuerui Yang, Yayue Deng, Yechang Huang, Yuxin Li, Yuxin Zhang","submitted_at":"2026-04-28T14:44:30Z","abstract_excerpt":"Recent advancements in large audio language models have extended Chain-of-Thought (CoT) reasoning into the auditory domain, enabling models to tackle increasingly complex acoustic and spoken tasks. To elicit and sustain these extended reasoning chains, the prevailing paradigm -- driven by the success of text-based reasoning models -- overwhelmingly relies on Reinforcement Learning with Verified Rewards (RLVR). However, as models are strictly optimized to distill rich, continuous auditory contexts into isolated, verifiable text labels, a fundamental question arises: are we fostering true audio "},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Step-Audio-R1.5 not only maintains robust analytical reasoning but profoundly transforms the interactive experience, redefining the boundaries of deeply immersive long-turn spoken dialogue.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That RLVR training inherently and systematically degrades prosodic naturalness, emotional continuity, and user immersion in audio models, while RLHF can restore these qualities without new trade-offs.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Step-Audio-R1.5 applies RLHF to audio reasoning models to maintain analytical performance while improving prosodic naturalness and immersion in extended spoken interactions.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Reinforcement learning from human feedback keeps audio reasoning strong while restoring natural spoken dialogue qualities lost under verified rewards.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"a5df00e87dd8141398e2efad9544658e1c86cb04f3a8d16b5f954529ea06a9a6"},"source":{"id":"2604.25719","kind":"arxiv","version":2},"verdict":{"id":"5670f7e0-d088-44d6-a9e8-342db964d1a4","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-07T14:06:38.216830Z","strongest_claim":"Step-Audio-R1.5 not only maintains robust analytical reasoning but profoundly transforms the interactive experience, redefining the boundaries of deeply immersive long-turn spoken dialogue.","one_line_summary":"Step-Audio-R1.5 applies RLHF to audio reasoning models to maintain analytical performance while improving prosodic naturalness and immersion in extended spoken interactions.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That RLVR training inherently and systematically degrades prosodic naturalness, emotional continuity, and user immersion in audio models, while RLHF can restore these qualities without new trade-offs.","pith_extraction_headline":"Reinforcement learning from human feedback keeps audio reasoning strong while restoring natural spoken dialogue qualities lost under verified rewards."},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2604.25719/integrity.json","findings":[],"available":true,"detectors_run":[{"name":"ai_meta_artifact","ran_at":"2026-05-21T04:35:07.197931Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"doi_compliance","ran_at":"2026-05-19T20:51:25.818833Z","status":"completed","version":"1.0.0","findings_count":0}],"snapshot_sha256":"8fbf1403a9a970e4a009ef023eb07fcd9da98b086ba4aa723c93bdaf0aaca46e"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"5670f7e0-d088-44d6-a9e8-342db964d1a4"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-06-02T02:04:18Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"mvEYvDTEPm3hIijDpxm347lBlNrsfX7zi0NDEH3ITnAnb21MnWT0lgySxd7DrqYNxaint6gSEdatpxRrnRs8AQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-05T11:58:51.735057Z"},"content_sha256":"f2088d179d05c14b6e37d1212d153dc44820bb0f057af824d681edafbb9d021f","schema_version":"1.0","event_id":"sha256:f2088d179d05c14b6e37d1212d153dc44820bb0f057af824d681edafbb9d021f"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/PHKD6RXN32BU63J7BVY36ELX5Z/bundle.json","state_url":"https://pith.science/pith/PHKD6RXN32BU63J7BVY36ELX5Z/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/PHKD6RXN32BU63J7BVY36ELX5Z/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-05T11:58:51Z","links":{"resolver":"https://pith.science/pith/PHKD6RXN32BU63J7BVY36ELX5Z","bundle":"https://pith.science/pith/PHKD6RXN32BU63J7BVY36ELX5Z/bundle.json","state":"https://pith.science/pith/PHKD6RXN32BU63J7BVY36ELX5Z/state.json","well_known_bundle":"https://pith.science/.well-known/pith/PHKD6RXN32BU63J7BVY36ELX5Z/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:PHKD6RXN32BU63J7BVY36ELX5Z","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"41843e555f50286fe27243fc731d333ae7cdf2d0f24384c72a773ce7802391c9","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"eess.AS","submitted_at":"2026-04-28T14:44:30Z","title_canon_sha256":"e0bdc6f884074d20753f8b342dd28f73a92f301d63a8c1898816f5cd9be6c90a"},"schema_version":"1.0","source":{"id":"2604.25719","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2604.25719","created_at":"2026-06-02T02:04:18Z"},{"alias_kind":"arxiv_version","alias_value":"2604.25719v2","created_at":"2026-06-02T02:04:18Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2604.25719","created_at":"2026-06-02T02:04:18Z"},{"alias_kind":"pith_short_12","alias_value":"PHKD6RXN32BU","created_at":"2026-06-02T02:04:18Z"},{"alias_kind":"pith_short_16","alias_value":"PHKD6RXN32BU63J7","created_at":"2026-06-02T02:04:18Z"},{"alias_kind":"pith_short_8","alias_value":"PHKD6RXN","created_at":"2026-06-02T02:04:18Z"}],"graph_snapshots":[{"event_id":"sha256:f2088d179d05c14b6e37d1212d153dc44820bb0f057af824d681edafbb9d021f","target":"graph","created_at":"2026-06-02T02:04:18Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Step-Audio-R1.5 not only maintains robust analytical reasoning but profoundly transforms the interactive experience, redefining the boundaries of deeply immersive long-turn spoken dialogue."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That RLVR training inherently and systematically degrades prosodic naturalness, emotional continuity, and user immersion in audio models, while RLHF can restore these qualities without new trade-offs."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Step-Audio-R1.5 applies RLHF to audio reasoning models to maintain analytical performance while improving prosodic naturalness and immersion in extended spoken interactions."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Reinforcement learning from human feedback keeps audio reasoning strong while restoring natural spoken dialogue qualities lost under verified rewards."}],"snapshot_sha256":"a5df00e87dd8141398e2efad9544658e1c86cb04f3a8d16b5f954529ea06a9a6"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[{"findings_count":0,"name":"ai_meta_artifact","ran_at":"2026-05-21T04:35:07.197931Z","status":"completed","version":"1.0.0"},{"findings_count":0,"name":"doi_compliance","ran_at":"2026-05-19T20:51:25.818833Z","status":"completed","version":"1.0.0"}],"endpoint":"/pith/2604.25719/integrity.json","findings":[],"snapshot_sha256":"8fbf1403a9a970e4a009ef023eb07fcd9da98b086ba4aa723c93bdaf0aaca46e","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Recent advancements in large audio language models have extended Chain-of-Thought (CoT) reasoning into the auditory domain, enabling models to tackle increasingly complex acoustic and spoken tasks. To elicit and sustain these extended reasoning chains, the prevailing paradigm -- driven by the success of text-based reasoning models -- overwhelmingly relies on Reinforcement Learning with Verified Rewards (RLVR). However, as models are strictly optimized to distill rich, continuous auditory contexts into isolated, verifiable text labels, a fundamental question arises: are we fostering true audio ","authors_text":"Chengyuan Yao, Daijiao Liu, Daxin Jiang, Eng Siong Chng, Fei Tian, Gang Yu, Haoyang Zhang, Hexin Liu, Jinglan Gong, Jun Chen, Liang Zhao, Qingjian Lin, Xiangyu Tony Zhang, Xiangyu Zhang, Xuerui Yang, Yayue Deng, Yechang Huang, Yuxin Li, Yuxin Zhang","cross_cats":[],"headline":"Reinforcement learning from human feedback keeps audio reasoning strong while restoring natural spoken dialogue qualities lost under verified rewards.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"eess.AS","submitted_at":"2026-04-28T14:44:30Z","title":"Step-Audio-R1.5 Technical Report"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2604.25719","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-07T14:06:38.216830Z","id":"5670f7e0-d088-44d6-a9e8-342db964d1a4","model_set":{"reader":"grok-4.3"},"one_line_summary":"Step-Audio-R1.5 applies RLHF to audio reasoning models to maintain analytical performance while improving prosodic naturalness and immersion in extended spoken interactions.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Reinforcement learning from human feedback keeps audio reasoning strong while restoring natural spoken dialogue qualities lost under verified rewards.","strongest_claim":"Step-Audio-R1.5 not only maintains robust analytical reasoning but profoundly transforms the interactive experience, redefining the boundaries of deeply immersive long-turn spoken dialogue.","weakest_assumption":"That RLVR training inherently and systematically degrades prosodic naturalness, emotional continuity, and user immersion in audio models, while RLHF can restore these qualities without new trade-offs."}},"verdict_id":"5670f7e0-d088-44d6-a9e8-342db964d1a4"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:78cb852ce063e663352f834d957e3e111adad4bae20ea400d16e76eafa74161b","target":"record","created_at":"2026-06-02T02:04:18Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"41843e555f50286fe27243fc731d333ae7cdf2d0f24384c72a773ce7802391c9","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"eess.AS","submitted_at":"2026-04-28T14:44:30Z","title_canon_sha256":"e0bdc6f884074d20753f8b342dd28f73a92f301d63a8c1898816f5cd9be6c90a"},"schema_version":"1.0","source":{"id":"2604.25719","kind":"arxiv","version":2}},"canonical_sha256":"79d43f46edde834f6d3f0d71bf1177ee54aa2a59807d92332b0dddf6f87c744d","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"79d43f46edde834f6d3f0d71bf1177ee54aa2a59807d92332b0dddf6f87c744d","first_computed_at":"2026-06-02T02:04:18.241351Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-06-02T02:04:18.241351Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"M8nmK5RVwGUt5j4ID57BIta4+/8drAZasQS2rdQb7GIAoe4FRIrYSHXLltiyD23eashcZjEZo2aG8T+wLCuvAA==","signature_status":"signed_v1","signed_at":"2026-06-02T02:04:18.241780Z","signed_message":"canonical_sha256_bytes"},"source_id":"2604.25719","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:78cb852ce063e663352f834d957e3e111adad4bae20ea400d16e76eafa74161b","sha256:f2088d179d05c14b6e37d1212d153dc44820bb0f057af824d681edafbb9d021f"],"state_sha256":"4cd1c7ab58bb95b19eae6d968506eb257d407e1b50ecc9607dcd8ef9291555b7"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"I1nvznn6fMtM015sYC29t3LT6XUsgIpJhTbEQc4S2+/FFk1wDAAb2nlIVkmTSltRJNA25Fkf2VbepYBB17COAw==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-05T11:58:51.737462Z","bundle_sha256":"21166a86a533aa9a59824196af08b97ced8df31d57aa7ed757d037e7239c6faa"}}