{"work":{"id":"5dfb46e7-e952-409d-a3c7-ba7f20aebad6","openalex_id":null,"doi":null,"arxiv_id":"1212.0402","raw_key":null,"title":"UCF101: A Dataset of 101 Human Actions Classes From Videos in The Wild","authors":null,"authors_text":"Khurram Soomro, Amir Roshan Zamir, Mubarak Shah","year":2012,"venue":"cs.CV","abstract":"We introduce UCF101 which is currently the largest dataset of human actions. It consists of 101 action classes, over 13k clips and 27 hours of video data. The database consists of realistic user uploaded videos containing camera motion and cluttered background. Additionally, we provide baseline action recognition results on this new dataset using standard bag of words approach with overall performance of 44.5%. To the best of our knowledge, UCF101 is currently the most challenging dataset of actions due to its large number of classes, large number of clips and also unconstrained nature of such clips.","external_url":"https://arxiv.org/abs/1212.0402","cited_by_count":null,"metadata_source":"pith","metadata_fetched_at":"2026-05-18T05:00:54.673067+00:00","pith_arxiv_id":"1212.0402","created_at":"2026-05-09T04:17:20.313659+00:00","updated_at":"2026-05-18T05:00:54.673067+00:00","title_quality_ok":true,"display_title":"UCF101: A Dataset of 101 Human Actions Classes From Videos in The Wild","render_title":"UCF101: A Dataset of 101 Human Actions Classes From Videos in The Wild"},"hub":{"state":{"work_id":"5dfb46e7-e952-409d-a3c7-ba7f20aebad6","tier":"hub","tier_reason":"10+ Pith inbound or 1,000+ external citations","pith_inbound_count":72,"external_cited_by_count":null,"distinct_field_count":8,"first_pith_cited_at":"2017-05-19T12:07:01+00:00","last_pith_cited_at":"2026-05-13T21:39:50+00:00","author_build_status":"not_needed","summary_status":"needed","contexts_status":"needed","graph_status":"needed","ask_index_status":"not_needed","reader_status":"not_needed","recognition_status":"not_needed","updated_at":"2026-05-18T05:10:21.962954+00:00","tier_text":"hub"},"tier":"hub","role_counts":[{"context_role":"dataset","n":10},{"context_role":"background","n":2},{"context_role":"baseline","n":2}],"polarity_counts":[{"context_polarity":"use_dataset","n":9},{"context_polarity":"background","n":2},{"context_polarity":"baseline","n":2},{"context_polarity":"unclear","n":1}],"runs":{"context_extract":{"job_type":"context_extract","status":"succeeded","result":{"enqueued_papers":25},"error":null,"updated_at":"2026-05-14T13:31:06.050811+00:00"},"graph_features":{"job_type":"graph_features","status":"succeeded","result":{"co_cited":[{"title":"Fine-Grained Visual Classification of Aircraft","work_id":"ed360110-3ce4-4959-8c74-1785cd9e537d","shared_citers":14},{"title":"The Kinetics Human Action Video Dataset","work_id":"c8a3de61-cfd3-4aeb-bcf7-a0372c015748","shared_citers":14},{"title":"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale","work_id":"e96730e3-129b-4db6-b981-15ab7932e297","shared_citers":7},{"title":"Wan: Open and Advanced Large-Scale Video Generative Models","work_id":"ad3ebc3b-4224-46c9-b61d-bcf135da0a7c","shared_citers":6},{"title":"Decoupled Weight Decay Regularization","work_id":"07ef7360-d385-4033-83f7-8384a6325204","shared_citers":5},{"title":"Hierarchical Text-Conditional Image Generation with CLIP Latents","work_id":"0c6a768b-70b8-4242-bb0e-459f1008c9fc","shared_citers":5},{"title":"HunyuanVideo: A Systematic Framework For Large Video Generative Models","work_id":"881efa7e-7e73-4c66-9cc3-2803e551061c","shared_citers":5},{"title":"Adam: A Method for Stochastic Optimization","work_id":"1910796d-9b52-4683-bf5c-de9632c1028b","shared_citers":4},{"title":"A short note about kinetics-600","work_id":"851b1623-6feb-441e-8849-b07f1753f22e","shared_citers":4},{"title":"DINOv2: Learning Robust Visual Features without Supervision","work_id":"26b304e5-b54a-4f26-be7e-83299eca52e4","shared_citers":4},{"title":"Seedance 1.0: Exploring the Boundaries of Video Generation Models","work_id":"b2e36b5d-99e4-45b4-9358-64f6d3501983","shared_citers":4},{"title":"Stable Video Diffusion: Scaling Latent Video Diffusion Models to Large Datasets","work_id":"4f68eada-27e3-437a-a2fe-6e4ca524d0d3","shared_citers":4},{"title":"Towards Accurate Generative Models of Video: A New Metric & Challenges","work_id":"72f42543-17d5-49aa-ba5a-25d67ffbb88a","shared_citers":4},{"title":"arXiv preprint arXiv:2407.02371 (2024)","work_id":"00dd95a8-d503-4a41-9603-785dcf4a0b8e","shared_citers":3},{"title":"Attention Is All You Need","work_id":"baafb5a2-5272-43bc-932f-09fa9ffe5316","shared_citers":3},{"title":"Classifier-Free Diffusion Guidance","work_id":"acf2c588-c088-4a6c-938e-150ad7c666d7","shared_citers":3},{"title":"Coca: Contrastive captioners are image-text foundation models","work_id":"5dd5bf10-d548-40ff-9b6c-6735129b27ee","shared_citers":3},{"title":"CogVideo: Large-scale Pretraining for Text-to-Video Generation via Transformers","work_id":"2dbd6bcd-fc98-4fbf-b586-f6d94fe1abd2","shared_citers":3},{"title":"Efficient video diffusion models via content-frame motion-latent decomposition","work_id":"d262800e-d0e2-4bf8-ac7e-255fab126f46","shared_citers":3},{"title":"Eyes Wide Shut? Exploring the Visual Shortcomings of Multimodal LLMs","work_id":"7efbc2dd-b0f2-4f71-bb1c-d2fcf110d805","shared_citers":3},{"title":"Generating Long Sequences with Sparse Transformers","work_id":"c5b81688-45ee-4a9a-b095-e6290f45cb6c","shared_citers":3},{"title":"In: 2023 IEEE/CVF Conference on Com- puter Vision and Pattern Recognition (CVPR)","work_id":"b9701eca-d05e-4d2e-9045-6761df4ba175","shared_citers":3},{"title":"Language Models are Few-Shot Learners","work_id":"214732c0-2edd-44a0-af9e-28184a2b8279","shared_citers":3},{"title":"Proximal Policy Optimization Algorithms","work_id":"240c67fe-d14d-4520-91c1-38a4e272ca19","shared_citers":3}],"time_series":[{"n":1,"year":2017},{"n":1,"year":2021},{"n":2,"year":2022},{"n":4,"year":2023},{"n":1,"year":2025},{"n":42,"year":2026}],"dependency_candidates":[]},"error":null,"updated_at":"2026-05-14T13:31:06.071168+00:00"},"identity_refresh":{"job_type":"identity_refresh","status":"succeeded","result":{"items":[{"title":"Qwen3 Technical Report","outcome":"unchanged","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","resolver":"local_arxiv","confidence":0.98,"old_work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e"}],"counts":{"fixed":0,"merged":0,"unchanged":1,"quarantined":0,"needs_external_resolution":0},"errors":[],"attempted":1},"error":null,"updated_at":"2026-05-14T13:31:13.006854+00:00"},"summary_claims":{"job_type":"summary_claims","status":"succeeded","result":{"title":"UCF101: A Dataset of 101 Human Actions Classes From Videos in The Wild","claims":[{"claim_text":"We introduce UCF101 which is currently the largest dataset of human actions. It consists of 101 action classes, over 13k clips and 27 hours of video data. The database consists of realistic user uploaded videos containing camera motion and cluttered background. Additionally, we provide baseline action recognition results on this new dataset using standard bag of words approach with overall performance of 44.5%. To the best of our knowledge, UCF101 is currently the most challenging dataset of actions due to its large number of classes, large number of clips and also unconstrained nature of such","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks UCF101: A Dataset of 101 Human Actions Classes From Videos in The Wild because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-14T13:31:00.455584+00:00"}},"summary":{"title":"UCF101: A Dataset of 101 Human Actions Classes From Videos in The Wild","claims":[{"claim_text":"We introduce UCF101 which is currently the largest dataset of human actions. It consists of 101 action classes, over 13k clips and 27 hours of video data. The database consists of realistic user uploaded videos containing camera motion and cluttered background. Additionally, we provide baseline action recognition results on this new dataset using standard bag of words approach with overall performance of 44.5%. To the best of our knowledge, UCF101 is currently the most challenging dataset of actions due to its large number of classes, large number of clips and also unconstrained nature of such","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks UCF101: A Dataset of 101 Human Actions Classes From Videos in The Wild because it crossed a citation-hub threshold.","role_counts":[]},"graph":{"co_cited":[{"title":"Fine-Grained Visual Classification of Aircraft","work_id":"ed360110-3ce4-4959-8c74-1785cd9e537d","shared_citers":14},{"title":"The Kinetics Human Action Video Dataset","work_id":"c8a3de61-cfd3-4aeb-bcf7-a0372c015748","shared_citers":14},{"title":"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale","work_id":"e96730e3-129b-4db6-b981-15ab7932e297","shared_citers":7},{"title":"Wan: Open and Advanced Large-Scale Video Generative Models","work_id":"ad3ebc3b-4224-46c9-b61d-bcf135da0a7c","shared_citers":6},{"title":"Decoupled Weight Decay Regularization","work_id":"07ef7360-d385-4033-83f7-8384a6325204","shared_citers":5},{"title":"Hierarchical Text-Conditional Image Generation with CLIP Latents","work_id":"0c6a768b-70b8-4242-bb0e-459f1008c9fc","shared_citers":5},{"title":"HunyuanVideo: A Systematic Framework For Large Video Generative Models","work_id":"881efa7e-7e73-4c66-9cc3-2803e551061c","shared_citers":5},{"title":"Adam: A Method for Stochastic Optimization","work_id":"1910796d-9b52-4683-bf5c-de9632c1028b","shared_citers":4},{"title":"A short note about kinetics-600","work_id":"851b1623-6feb-441e-8849-b07f1753f22e","shared_citers":4},{"title":"DINOv2: Learning Robust Visual Features without Supervision","work_id":"26b304e5-b54a-4f26-be7e-83299eca52e4","shared_citers":4},{"title":"Seedance 1.0: Exploring the Boundaries of Video Generation Models","work_id":"b2e36b5d-99e4-45b4-9358-64f6d3501983","shared_citers":4},{"title":"Stable Video Diffusion: Scaling Latent Video Diffusion Models to Large Datasets","work_id":"4f68eada-27e3-437a-a2fe-6e4ca524d0d3","shared_citers":4},{"title":"Towards Accurate Generative Models of Video: A New Metric & Challenges","work_id":"72f42543-17d5-49aa-ba5a-25d67ffbb88a","shared_citers":4},{"title":"arXiv preprint arXiv:2407.02371 (2024)","work_id":"00dd95a8-d503-4a41-9603-785dcf4a0b8e","shared_citers":3},{"title":"Attention Is All You Need","work_id":"baafb5a2-5272-43bc-932f-09fa9ffe5316","shared_citers":3},{"title":"Classifier-Free Diffusion Guidance","work_id":"acf2c588-c088-4a6c-938e-150ad7c666d7","shared_citers":3},{"title":"Coca: Contrastive captioners are image-text foundation models","work_id":"5dd5bf10-d548-40ff-9b6c-6735129b27ee","shared_citers":3},{"title":"CogVideo: Large-scale Pretraining for Text-to-Video Generation via Transformers","work_id":"2dbd6bcd-fc98-4fbf-b586-f6d94fe1abd2","shared_citers":3},{"title":"Efficient video diffusion models via content-frame motion-latent decomposition","work_id":"d262800e-d0e2-4bf8-ac7e-255fab126f46","shared_citers":3},{"title":"Eyes Wide Shut? Exploring the Visual Shortcomings of Multimodal LLMs","work_id":"7efbc2dd-b0f2-4f71-bb1c-d2fcf110d805","shared_citers":3},{"title":"Generating Long Sequences with Sparse Transformers","work_id":"c5b81688-45ee-4a9a-b095-e6290f45cb6c","shared_citers":3},{"title":"In: 2023 IEEE/CVF Conference on Com- puter Vision and Pattern Recognition (CVPR)","work_id":"b9701eca-d05e-4d2e-9045-6761df4ba175","shared_citers":3},{"title":"Language Models are Few-Shot Learners","work_id":"214732c0-2edd-44a0-af9e-28184a2b8279","shared_citers":3},{"title":"Proximal Policy Optimization Algorithms","work_id":"240c67fe-d14d-4520-91c1-38a4e272ca19","shared_citers":3}],"time_series":[{"n":1,"year":2017},{"n":1,"year":2021},{"n":2,"year":2022},{"n":4,"year":2023},{"n":1,"year":2025},{"n":42,"year":2026}],"dependency_candidates":[]},"authors":[]}}