{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:7IK6ILWQAPXKMPENTXFUTGBANA","short_pith_number":"pith:7IK6ILWQ","schema_version":"1.0","canonical_sha256":"fa15e42ed003eea63c8d9dcb49982068273784102384f097c27a1235d436fc3c","source":{"kind":"arxiv","id":"2605.15104","version":1},"attestation_state":"computed","paper":{"title":"From Text to Voice: A Reproducible and Verifiable Framework for Evaluating Tool Calling LLM Agents","license":"http://creativecommons.org/licenses/by/4.0/","headline":"A dataset-agnostic framework converts text tool-calling benchmarks to paired audio versions via TTS and noise, showing model-dependent performance with small text-to-voice gaps of 1.8-4.8 points on Confetti and When2Call.","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Jonas Robertson, Md Tahmid Rahman Laskar, Quinten McNamara, Seyyed Saeed Sarfjoo, Shashi Bhushan TN, Xue-Yong Fu","submitted_at":"2026-05-14T17:22:42Z","abstract_excerpt":"Voice agents increasingly require reliable tool use from speech, whereas prominent tool-calling benchmarks remain text-based. We study whether verified text benchmarks can be converted into controlled audio-based tool calling evaluations without re-annotating the tool schema and gold labels. Our dataset-agnostic framework uses text-to-speech, speaker variation, and environmental noise to create paired text-audio instances while preserving the original dataset annotations. Based on extensive evaluation of 7 omni-modal models on audio-converted versions of Confetti and When2Call, our framework d"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2605.15104","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-14T17:22:42Z","cross_cats_sorted":[],"title_canon_sha256":"8e5b9e5718a5ec88afa0d84104e3d88ac3579debb8b5b46b0b714e9896a94ae2","abstract_canon_sha256":"0ca004fe3ea5eccd10deb2c2fa2bcf5406961f23c64776f168d631d878553e81"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.2","canonical_sha256":"fa15e42ed003eea63c8d9dcb49982068273784102384f097c27a1235d436fc3c","last_reissued_at":"2026-05-17T21:57:19.134793Z","signature_status":"unsigned_v0","first_computed_at":"2026-05-17T21:40:25.803035Z"},"graph_snapshot":{"paper":{"title":"From Text to Voice: A Reproducible and Verifiable Framework for Evaluating Tool Calling LLM Agents","license":"http://creativecommons.org/licenses/by/4.0/","headline":"A dataset-agnostic framework converts text tool-calling benchmarks to paired audio versions via TTS and noise, showing model-dependent performance with small text-to-voice gaps of 1.8-4.8 points on Confetti and When2Call.","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Jonas Robertson, Md Tahmid Rahman Laskar, Quinten McNamara, Seyyed Saeed Sarfjoo, Shashi Bhushan TN, Xue-Yong Fu","submitted_at":"2026-05-14T17:22:42Z","abstract_excerpt":"Voice agents increasingly require reliable tool use from speech, whereas prominent tool-calling benchmarks remain text-based. We study whether verified text benchmarks can be converted into controlled audio-based tool calling evaluations without re-annotating the tool schema and gold labels. Our dataset-agnostic framework uses text-to-speech, speaker variation, and environmental noise to create paired text-audio instances while preserving the original dataset annotations. Based on extensive evaluation of 7 omni-modal models on audio-converted versions of Confetti and When2Call, our framework d"},"claims":{"count":3,"items":[{"kind":"strongest_claim","text":"Our dataset-agnostic framework uses text-to-speech, speaker variation, and environmental noise to create paired text-audio instances while preserving the original dataset annotations.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That adding TTS, speaker variation, and environmental noise does not introduce new biases or artifacts that change how models interpret tool arguments or intent in ways that the preserved gold labels fail to capture.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"A dataset-agnostic framework converts text tool-calling benchmarks to paired audio versions via TTS and noise, showing model-dependent performance with small text-to-voice gaps of 1.8-4.8 points on Confetti and When2Call.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"}],"snapshot_sha256":"07177578b3589c9d3bc869a279e05849da3fd4eb1cc7a91089b853ada5d09583"},"source":{"id":"2605.15104","kind":"arxiv","version":1},"verdict":{"id":"ac5245d5-a3a5-4069-9e23-d2c14e55e689","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T03:18:59.044985Z","strongest_claim":"Our dataset-agnostic framework uses text-to-speech, speaker variation, and environmental noise to create paired text-audio instances while preserving the original dataset annotations.","one_line_summary":"A dataset-agnostic framework converts text tool-calling benchmarks to paired audio versions via TTS and noise, showing model-dependent performance with small text-to-voice gaps of 1.8-4.8 points on Confetti and When2Call.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That adding TTS, speaker variation, and environmental noise does not introduce new biases or artifacts that change how models interpret tool arguments or intent in ways that the preserved gold labels fail to capture.","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2605.15104","created_at":"2026-05-17T21:18:33.361462+00:00"},{"alias_kind":"arxiv_version","alias_value":"2605.15104v1","created_at":"2026-05-17T21:18:33.361462+00:00"},{"alias_kind":"pith_short_12","alias_value":"7IK6ILWQAPXK","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_16","alias_value":"7IK6ILWQAPXKMPEN","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_8","alias_value":"7IK6ILWQ","created_at":"2026-05-18T12:33:37.589309+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/7IK6ILWQAPXKMPENTXFUTGBANA","json":"https://pith.science/pith/7IK6ILWQAPXKMPENTXFUTGBANA.json","graph_json":"https://pith.science/api/pith-number/7IK6ILWQAPXKMPENTXFUTGBANA/graph.json","events_json":"https://pith.science/api/pith-number/7IK6ILWQAPXKMPENTXFUTGBANA/events.json","paper":"https://pith.science/paper/7IK6ILWQ"},"agent_actions":{"view_html":"https://pith.science/pith/7IK6ILWQAPXKMPENTXFUTGBANA","download_json":"https://pith.science/pith/7IK6ILWQAPXKMPENTXFUTGBANA.json","view_paper":"https://pith.science/paper/7IK6ILWQ","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2605.15104&json=true","fetch_graph":"https://pith.science/api/pith-number/7IK6ILWQAPXKMPENTXFUTGBANA/graph.json","fetch_events":"https://pith.science/api/pith-number/7IK6ILWQAPXKMPENTXFUTGBANA/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/7IK6ILWQAPXKMPENTXFUTGBANA/action/timestamp_anchor","attest_storage":"https://pith.science/pith/7IK6ILWQAPXKMPENTXFUTGBANA/action/storage_attestation","attest_author":"https://pith.science/pith/7IK6ILWQAPXKMPENTXFUTGBANA/action/author_attestation","sign_citation":"https://pith.science/pith/7IK6ILWQAPXKMPENTXFUTGBANA/action/citation_signature","submit_replication":"https://pith.science/pith/7IK6ILWQAPXKMPENTXFUTGBANA/action/replication_record"}},"created_at":"2026-05-17T21:18:33.361462+00:00","updated_at":"2026-05-17T21:57:19.134864+00:00"}