{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:MPMEQCWRKP7IVPRQZYJZWOFPHS","short_pith_number":"pith:MPMEQCWR","schema_version":"1.0","canonical_sha256":"63d8480ad153fe8abe30ce139b38af3c9b68cedc944e717f4227d1bf6dbfdd04","source":{"kind":"arxiv","id":"2601.06329","version":2},"attestation_state":"computed","paper":{"title":"On the Fallacy of Global Token Perplexity in Spoken Language Model Evaluation","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.CL","authors_text":"Carlos Busso, Chan-Jan Hsu, Hung-yi Lee, Ju-chieh Chou, Kai-Wei Chang, Liang-Hsuan Tseng, Yen-Chun Kuo, Yi-Cheng Lin","submitted_at":"2026-01-09T22:01:56Z","abstract_excerpt":"Generative spoken language models pretrained on large-scale raw audio can continue a speech prompt with appropriate content while preserving attributes like speaker and emotion, serving as foundation models for spoken dialogue. In prior literature, these models are often evaluated using ``global token perplexity'', which directly applies the text perplexity formulation to speech tokens. However, this practice overlooks fundamental differences between speech and text modalities, possibly leading to an underestimation of the speech characteristics. In this work, we propose a variety of likelihoo"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2601.06329","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-01-09T22:01:56Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"abdd9784a0871093982e1f0ee934e73f8514e5be2b6aadb45b0f819d5ba00394","abstract_canon_sha256":"ab3c016d97694fce01cd7fb57a4c15f9c1d9b0b37c7bade7344ba386663237d2"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-28T01:04:35.195137Z","signature_b64":"DbmachQYOvlit1THNy7xTC7130/7JoQxX/4BVU+VWyk/jq0tSfuCjqQu4t+GVgzv7sZvzNt7OZmHgWDgycSTCQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"63d8480ad153fe8abe30ce139b38af3c9b68cedc944e717f4227d1bf6dbfdd04","last_reissued_at":"2026-05-28T01:04:35.194584Z","signature_status":"signed_v1","first_computed_at":"2026-05-28T01:04:35.194584Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"On the Fallacy of Global Token Perplexity in Spoken Language Model Evaluation","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.CL","authors_text":"Carlos Busso, Chan-Jan Hsu, Hung-yi Lee, Ju-chieh Chou, Kai-Wei Chang, Liang-Hsuan Tseng, Yen-Chun Kuo, Yi-Cheng Lin","submitted_at":"2026-01-09T22:01:56Z","abstract_excerpt":"Generative spoken language models pretrained on large-scale raw audio can continue a speech prompt with appropriate content while preserving attributes like speaker and emotion, serving as foundation models for spoken dialogue. In prior literature, these models are often evaluated using ``global token perplexity'', which directly applies the text perplexity formulation to speech tokens. However, this practice overlooks fundamental differences between speech and text modalities, possibly leading to an underestimation of the speech characteristics. In this work, we propose a variety of likelihoo"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2601.06329","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2601.06329/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2601.06329","created_at":"2026-05-28T01:04:35.194642+00:00"},{"alias_kind":"arxiv_version","alias_value":"2601.06329v2","created_at":"2026-05-28T01:04:35.194642+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2601.06329","created_at":"2026-05-28T01:04:35.194642+00:00"},{"alias_kind":"pith_short_12","alias_value":"MPMEQCWRKP7I","created_at":"2026-05-28T01:04:35.194642+00:00"},{"alias_kind":"pith_short_16","alias_value":"MPMEQCWRKP7IVPRQ","created_at":"2026-05-28T01:04:35.194642+00:00"},{"alias_kind":"pith_short_8","alias_value":"MPMEQCWR","created_at":"2026-05-28T01:04:35.194642+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/MPMEQCWRKP7IVPRQZYJZWOFPHS","json":"https://pith.science/pith/MPMEQCWRKP7IVPRQZYJZWOFPHS.json","graph_json":"https://pith.science/api/pith-number/MPMEQCWRKP7IVPRQZYJZWOFPHS/graph.json","events_json":"https://pith.science/api/pith-number/MPMEQCWRKP7IVPRQZYJZWOFPHS/events.json","paper":"https://pith.science/paper/MPMEQCWR"},"agent_actions":{"view_html":"https://pith.science/pith/MPMEQCWRKP7IVPRQZYJZWOFPHS","download_json":"https://pith.science/pith/MPMEQCWRKP7IVPRQZYJZWOFPHS.json","view_paper":"https://pith.science/paper/MPMEQCWR","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2601.06329&json=true","fetch_graph":"https://pith.science/api/pith-number/MPMEQCWRKP7IVPRQZYJZWOFPHS/graph.json","fetch_events":"https://pith.science/api/pith-number/MPMEQCWRKP7IVPRQZYJZWOFPHS/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/MPMEQCWRKP7IVPRQZYJZWOFPHS/action/timestamp_anchor","attest_storage":"https://pith.science/pith/MPMEQCWRKP7IVPRQZYJZWOFPHS/action/storage_attestation","attest_author":"https://pith.science/pith/MPMEQCWRKP7IVPRQZYJZWOFPHS/action/author_attestation","sign_citation":"https://pith.science/pith/MPMEQCWRKP7IVPRQZYJZWOFPHS/action/citation_signature","submit_replication":"https://pith.science/pith/MPMEQCWRKP7IVPRQZYJZWOFPHS/action/replication_record"}},"created_at":"2026-05-28T01:04:35.194642+00:00","updated_at":"2026-05-28T01:04:35.194642+00:00"}