{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2022:76PIDF562LPEAVOQA5XTPJIN5C","short_pith_number":"pith:76PIDF56","schema_version":"1.0","canonical_sha256":"ff9e8197bed2de4055d0076f37a50de8bb9d4833e58738e375919cac11a75243","source":{"kind":"arxiv","id":"2204.09634","version":2},"attestation_state":"computed","paper":{"title":"Clotho-AQA: A Crowdsourced Dataset for Audio Question Answering","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","headline":"","cross_cats":["cs.LG","eess.AS"],"primary_cat":"cs.SD","authors_text":"Konstantinos Drossos, Parthasaarathy Sudarsanam, Samuel Lipping, Tuomas Virtanen","submitted_at":"2022-04-20T17:28:53Z","abstract_excerpt":"Audio question answering (AQA) is a multimodal translation task where a system analyzes an audio signal and a natural language question, to generate a desirable natural language answer. In this paper, we introduce Clotho-AQA, a dataset for Audio question answering consisting of 1991 audio files each between 15 to 30 seconds in duration selected from the Clotho dataset. For each audio file, we collect six different questions and corresponding answers by crowdsourcing using Amazon Mechanical Turk. The questions and answers are produced by different annotators. Out of the six questions for each a"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2204.09634","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.SD","submitted_at":"2022-04-20T17:28:53Z","cross_cats_sorted":["cs.LG","eess.AS"],"title_canon_sha256":"d086f0a459a7c872d7266c3a1edc698ccb3003f7028a6dbbd8c6dae7d9ecc7d0","abstract_canon_sha256":"add8327e567f6e2b32bffa8879ed098bcb6d7687a07b58f18f8464f247fa67fc"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-07-05T04:32:40.415069Z","signature_b64":"xKhGTIBb38tCe86iTNsaD7GpIySdPgBFUKo14D/UXe4ofmrM9vDSYVr9FArsW/Zyvd/eXoMWeKVILpM9StyhCw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"ff9e8197bed2de4055d0076f37a50de8bb9d4833e58738e375919cac11a75243","last_reissued_at":"2026-07-05T04:32:40.414588Z","signature_status":"signed_v1","first_computed_at":"2026-07-05T04:32:40.414588Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Clotho-AQA: A Crowdsourced Dataset for Audio Question Answering","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","headline":"","cross_cats":["cs.LG","eess.AS"],"primary_cat":"cs.SD","authors_text":"Konstantinos Drossos, Parthasaarathy Sudarsanam, Samuel Lipping, Tuomas Virtanen","submitted_at":"2022-04-20T17:28:53Z","abstract_excerpt":"Audio question answering (AQA) is a multimodal translation task where a system analyzes an audio signal and a natural language question, to generate a desirable natural language answer. In this paper, we introduce Clotho-AQA, a dataset for Audio question answering consisting of 1991 audio files each between 15 to 30 seconds in duration selected from the Clotho dataset. For each audio file, we collect six different questions and corresponding answers by crowdsourcing using Amazon Mechanical Turk. The questions and answers are produced by different annotators. Out of the six questions for each a"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2204.09634","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2204.09634/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2204.09634","created_at":"2026-07-05T04:32:40.414649+00:00"},{"alias_kind":"arxiv_version","alias_value":"2204.09634v2","created_at":"2026-07-05T04:32:40.414649+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2204.09634","created_at":"2026-07-05T04:32:40.414649+00:00"},{"alias_kind":"pith_short_12","alias_value":"76PIDF562LPE","created_at":"2026-07-05T04:32:40.414649+00:00"},{"alias_kind":"pith_short_16","alias_value":"76PIDF562LPEAVOQ","created_at":"2026-07-05T04:32:40.414649+00:00"},{"alias_kind":"pith_short_8","alias_value":"76PIDF56","created_at":"2026-07-05T04:32:40.414649+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":3,"internal_anchor_count":0,"sample":[{"citing_arxiv_id":"2607.00247","citing_title":"Adaptive Perturbation Selection for Contrastive Audio Decoding","ref_index":17,"is_internal_anchor":false},{"citing_arxiv_id":"2505.22765","citing_title":"StressTest: Can YOUR Speech LM Handle the Stress?","ref_index":2,"is_internal_anchor":false},{"citing_arxiv_id":"2604.21766","citing_title":"AUDITA: A New Dataset to Audit Humans vs. AI Skill at Audio QA","ref_index":6,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/76PIDF562LPEAVOQA5XTPJIN5C","json":"https://pith.science/pith/76PIDF562LPEAVOQA5XTPJIN5C.json","graph_json":"https://pith.science/api/pith-number/76PIDF562LPEAVOQA5XTPJIN5C/graph.json","events_json":"https://pith.science/api/pith-number/76PIDF562LPEAVOQA5XTPJIN5C/events.json","paper":"https://pith.science/paper/76PIDF56"},"agent_actions":{"view_html":"https://pith.science/pith/76PIDF562LPEAVOQA5XTPJIN5C","download_json":"https://pith.science/pith/76PIDF562LPEAVOQA5XTPJIN5C.json","view_paper":"https://pith.science/paper/76PIDF56","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2204.09634&json=true","fetch_graph":"https://pith.science/api/pith-number/76PIDF562LPEAVOQA5XTPJIN5C/graph.json","fetch_events":"https://pith.science/api/pith-number/76PIDF562LPEAVOQA5XTPJIN5C/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/76PIDF562LPEAVOQA5XTPJIN5C/action/timestamp_anchor","attest_storage":"https://pith.science/pith/76PIDF562LPEAVOQA5XTPJIN5C/action/storage_attestation","attest_author":"https://pith.science/pith/76PIDF562LPEAVOQA5XTPJIN5C/action/author_attestation","sign_citation":"https://pith.science/pith/76PIDF562LPEAVOQA5XTPJIN5C/action/citation_signature","submit_replication":"https://pith.science/pith/76PIDF562LPEAVOQA5XTPJIN5C/action/replication_record"}},"created_at":"2026-07-05T04:32:40.414649+00:00","updated_at":"2026-07-05T04:32:40.414649+00:00"}