{"work":{"id":"074eb9b5-e9c6-4075-8dcb-0e4c103924c1","openalex_id":null,"doi":null,"arxiv_id":"1903.00161","raw_key":null,"title":"DROP: A Reading Comprehension Benchmark Requiring Discrete Reasoning Over Paragraphs","authors":null,"authors_text":"Dua, D","year":2019,"venue":"cs.CL","abstract":"Reading comprehension has recently seen rapid progress, with systems matching humans on the most popular datasets for the task. However, a large body of work has highlighted the brittleness of these systems, showing that there is much work left to be done. We introduce a new English reading comprehension benchmark, DROP, which requires Discrete Reasoning Over the content of Paragraphs. In this crowdsourced, adversarially-created, 96k-question benchmark, a system must resolve references in a question, perhaps to multiple input positions, and perform discrete operations over them (such as addition, counting, or sorting). These operations require a much more comprehensive understanding of the content of paragraphs than what was necessary for prior datasets. We apply state-of-the-art methods from both the reading comprehension and semantic parsing literature on this dataset and show that the best systems only achieve 32.7% F1 on our generalized accuracy metric, while expert human performance is 96.0%. We additionally present a new model that combines reading comprehension methods with simple numerical reasoning to achieve 47.0% F1.","external_url":"https://arxiv.org/abs/1903.00161","cited_by_count":null,"metadata_source":"pith","metadata_fetched_at":"2026-05-22T18:36:58.720593+00:00","pith_arxiv_id":"1903.00161","created_at":"2026-05-09T06:20:42.395526+00:00","updated_at":"2026-05-22T18:36:58.720593+00:00","title_quality_ok":true,"display_title":"Drop: A reading comprehension benchmark requiring discrete reasoning over paragraphs","render_title":"Drop: A reading comprehension benchmark requiring discrete reasoning over paragraphs"},"hub":{"state":{"work_id":"074eb9b5-e9c6-4075-8dcb-0e4c103924c1","tier":"hub","tier_reason":"10+ Pith inbound or 1,000+ external citations","pith_inbound_count":25,"external_cited_by_count":null,"distinct_field_count":6,"first_pith_cited_at":"2020-02-10T18:55:58+00:00","last_pith_cited_at":"2026-05-18T08:18:22+00:00","author_build_status":"not_needed","summary_status":"needed","contexts_status":"needed","graph_status":"needed","ask_index_status":"not_needed","reader_status":"not_needed","recognition_status":"not_needed","updated_at":"2026-06-05T05:08:28.702936+00:00","tier_text":"hub"},"tier":"hub","role_counts":[{"context_role":"background","n":4},{"context_role":"dataset","n":3},{"context_role":"baseline","n":1}],"polarity_counts":[{"context_polarity":"background","n":5},{"context_polarity":"use_dataset","n":2},{"context_polarity":"baseline","n":1}],"runs":{},"summary":{},"graph":{},"authors":[]}}