{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2018:5G4VZBHC7SYK7CH33HHGU37ECH","short_pith_number":"pith:5G4VZBHC","schema_version":"1.0","canonical_sha256":"e9b95c84e2fcb0af88fbd9ce6a6fe411e2760f48503d555f1822b9e806296f2a","source":{"kind":"arxiv","id":"1809.01696","version":2},"attestation_state":"computed","paper":{"title":"TVQA: Localized, Compositional Video Question Answering","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","cs.CV"],"primary_cat":"cs.CL","authors_text":"Jie Lei, Licheng Yu, Mohit Bansal, Tamara L. Berg","submitted_at":"2018-09-05T19:14:11Z","abstract_excerpt":"Recent years have witnessed an increasing interest in image-based question-answering (QA) tasks. However, due to data limitations, there has been much less work on video-based QA. In this paper, we present TVQA, a large-scale video QA dataset based on 6 popular TV shows. TVQA consists of 152,545 QA pairs from 21,793 clips, spanning over 460 hours of video. Questions are designed to be compositional in nature, requiring systems to jointly localize relevant moments within a clip, comprehend subtitle-based dialogue, and recognize relevant visual concepts. We provide analyses of this new dataset a"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1809.01696","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2018-09-05T19:14:11Z","cross_cats_sorted":["cs.AI","cs.CV"],"title_canon_sha256":"200971c648ff0c32c6d6b570bd954ee8915aa698be2023b9f608a29e039ab324","abstract_canon_sha256":"261930cb22d04a7cf05e0ebe660367d56f27d25ee2fdbfba8a75cb144cf5d72e"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:46:46.188657Z","signature_b64":"/zEK60mvCcyyU2mBQZUcWadsIjpMsU00VZeEWcHmUGM0XocBU0IcrQYuFOWZlM+ibDb3x5RSjbVOtA2ToamGDg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"e9b95c84e2fcb0af88fbd9ce6a6fe411e2760f48503d555f1822b9e806296f2a","last_reissued_at":"2026-05-17T23:46:46.188221Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:46:46.188221Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"TVQA: Localized, Compositional Video Question Answering","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","cs.CV"],"primary_cat":"cs.CL","authors_text":"Jie Lei, Licheng Yu, Mohit Bansal, Tamara L. Berg","submitted_at":"2018-09-05T19:14:11Z","abstract_excerpt":"Recent years have witnessed an increasing interest in image-based question-answering (QA) tasks. However, due to data limitations, there has been much less work on video-based QA. In this paper, we present TVQA, a large-scale video QA dataset based on 6 popular TV shows. TVQA consists of 152,545 QA pairs from 21,793 clips, spanning over 460 hours of video. Questions are designed to be compositional in nature, requiring systems to jointly localize relevant moments within a clip, comprehend subtitle-based dialogue, and recognize relevant visual concepts. We provide analyses of this new dataset a"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1809.01696","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1809.01696","created_at":"2026-05-17T23:46:46.188284+00:00"},{"alias_kind":"arxiv_version","alias_value":"1809.01696v2","created_at":"2026-05-17T23:46:46.188284+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1809.01696","created_at":"2026-05-17T23:46:46.188284+00:00"},{"alias_kind":"pith_short_12","alias_value":"5G4VZBHC7SYK","created_at":"2026-05-18T12:32:08.215937+00:00"},{"alias_kind":"pith_short_16","alias_value":"5G4VZBHC7SYK7CH3","created_at":"2026-05-18T12:32:08.215937+00:00"},{"alias_kind":"pith_short_8","alias_value":"5G4VZBHC","created_at":"2026-05-18T12:32:08.215937+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":7,"internal_anchor_count":3,"sample":[{"citing_arxiv_id":"2312.02549","citing_title":"DemaFormer: Damped Exponential Moving Average Transformer with Energy-Based Modeling for Temporal Language Grounding","ref_index":22,"is_internal_anchor":true},{"citing_arxiv_id":"2303.16199","citing_title":"LLaMA-Adapter: Efficient Fine-tuning of Language Models with Zero-init Attention","ref_index":137,"is_internal_anchor":true},{"citing_arxiv_id":"2406.04264","citing_title":"MLVU: Benchmarking Multi-task Long Video Understanding","ref_index":21,"is_internal_anchor":true},{"citing_arxiv_id":"2604.22409","citing_title":"SpaMEM: Benchmarking Dynamic Spatial Reasoning via Perception-Memory Integration in Embodied Environments","ref_index":14,"is_internal_anchor":false},{"citing_arxiv_id":"2604.21766","citing_title":"AUDITA: A New Dataset to Audit Humans vs. AI Skill at Audio QA","ref_index":5,"is_internal_anchor":false},{"citing_arxiv_id":"2410.02713","citing_title":"LLaVA-Video: Video Instruction Tuning With Synthetic Data","ref_index":179,"is_internal_anchor":false},{"citing_arxiv_id":"2604.14149","citing_title":"One Token per Highly Selective Frame: Towards Extreme Compression for Long Video Understanding","ref_index":30,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/5G4VZBHC7SYK7CH33HHGU37ECH","json":"https://pith.science/pith/5G4VZBHC7SYK7CH33HHGU37ECH.json","graph_json":"https://pith.science/api/pith-number/5G4VZBHC7SYK7CH33HHGU37ECH/graph.json","events_json":"https://pith.science/api/pith-number/5G4VZBHC7SYK7CH33HHGU37ECH/events.json","paper":"https://pith.science/paper/5G4VZBHC"},"agent_actions":{"view_html":"https://pith.science/pith/5G4VZBHC7SYK7CH33HHGU37ECH","download_json":"https://pith.science/pith/5G4VZBHC7SYK7CH33HHGU37ECH.json","view_paper":"https://pith.science/paper/5G4VZBHC","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1809.01696&json=true","fetch_graph":"https://pith.science/api/pith-number/5G4VZBHC7SYK7CH33HHGU37ECH/graph.json","fetch_events":"https://pith.science/api/pith-number/5G4VZBHC7SYK7CH33HHGU37ECH/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/5G4VZBHC7SYK7CH33HHGU37ECH/action/timestamp_anchor","attest_storage":"https://pith.science/pith/5G4VZBHC7SYK7CH33HHGU37ECH/action/storage_attestation","attest_author":"https://pith.science/pith/5G4VZBHC7SYK7CH33HHGU37ECH/action/author_attestation","sign_citation":"https://pith.science/pith/5G4VZBHC7SYK7CH33HHGU37ECH/action/citation_signature","submit_replication":"https://pith.science/pith/5G4VZBHC7SYK7CH33HHGU37ECH/action/replication_record"}},"created_at":"2026-05-17T23:46:46.188284+00:00","updated_at":"2026-05-17T23:46:46.188284+00:00"}