{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2018:UOVSYDGJM7RKTZJJQPYJQV2R45","short_pith_number":"pith:UOVSYDGJ","schema_version":"1.0","canonical_sha256":"a3ab2c0cc967e2a9e52983f0985751e743077965e7228dd978cc60f5a0a91cf6","source":{"kind":"arxiv","id":"1808.01340","version":1},"attestation_state":"computed","paper":{"title":"A Short Note about Kinetics-600","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Andras Banki-Horvath, Andrew Zisserman, Chloe Hillier, Eric Noland, Joao Carreira","submitted_at":"2018-08-03T20:17:05Z","abstract_excerpt":"We describe an extension of the DeepMind Kinetics human action dataset from 400 classes, each with at least 400 video clips, to 600 classes, each with at least 600 video clips. In order to scale up the dataset we changed the data collection process so it uses multiple queries per class, with some of them in a language other than english -- portuguese. This paper details the changes between the two versions of the dataset and includes a comprehensive set of statistics of the new version as well as baseline results using the I3D neural network architecture. The paper is a companion to the releas"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1808.01340","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2018-08-03T20:17:05Z","cross_cats_sorted":[],"title_canon_sha256":"58ddbfb2eafba1d72a56754207d44c434e2b21f525e3a1e25aedb4c565d2c1c4","abstract_canon_sha256":"33262dde5e20b3dca34f60b50a4ab2af03e218c1ac6abf2dfc40d6285cd526d7"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T00:08:53.329881Z","signature_b64":"z5XZDuggQsmxoW0ml9Fm0rMifbnUUgXmY/83Vo7SaQdMgX+4kPM5xjBPFSYWfmT48GBP1kbtLau0POn8G1NgDw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"a3ab2c0cc967e2a9e52983f0985751e743077965e7228dd978cc60f5a0a91cf6","last_reissued_at":"2026-05-18T00:08:53.329222Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T00:08:53.329222Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"A Short Note about Kinetics-600","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Andras Banki-Horvath, Andrew Zisserman, Chloe Hillier, Eric Noland, Joao Carreira","submitted_at":"2018-08-03T20:17:05Z","abstract_excerpt":"We describe an extension of the DeepMind Kinetics human action dataset from 400 classes, each with at least 400 video clips, to 600 classes, each with at least 600 video clips. In order to scale up the dataset we changed the data collection process so it uses multiple queries per class, with some of them in a language other than english -- portuguese. This paper details the changes between the two versions of the dataset and includes a comprehensive set of statistics of the new version as well as baseline results using the I3D neural network architecture. The paper is a companion to the releas"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1808.01340","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1808.01340","created_at":"2026-05-18T00:08:53.329306+00:00"},{"alias_kind":"arxiv_version","alias_value":"1808.01340v1","created_at":"2026-05-18T00:08:53.329306+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1808.01340","created_at":"2026-05-18T00:08:53.329306+00:00"},{"alias_kind":"pith_short_12","alias_value":"UOVSYDGJM7RK","created_at":"2026-05-18T12:32:56.356000+00:00"},{"alias_kind":"pith_short_16","alias_value":"UOVSYDGJM7RKTZJJ","created_at":"2026-05-18T12:32:56.356000+00:00"},{"alias_kind":"pith_short_8","alias_value":"UOVSYDGJ","created_at":"2026-05-18T12:32:56.356000+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":22,"internal_anchor_count":10,"sample":[{"citing_arxiv_id":"2605.23288","citing_title":"Spatio-Temporal Similarity Volume Aggregation for Open-Vocabulary Action Recognition","ref_index":2,"is_internal_anchor":true},{"citing_arxiv_id":"2406.05615","citing_title":"Video-Language Understanding: A Survey from Model Architecture, Model Training, and Data Perspectives","ref_index":2,"is_internal_anchor":true},{"citing_arxiv_id":"2411.17690","citing_title":"Mechanisms of Multimodal Synchronization: Insights from Decoder-Based Video-Text-to-Speech Synthesis","ref_index":6,"is_internal_anchor":true},{"citing_arxiv_id":"2412.11149","citing_title":"A Comprehensive Survey of Action Quality Assessment: Method and Benchmark","ref_index":107,"is_internal_anchor":true},{"citing_arxiv_id":"2605.22819","citing_title":"Cambrian-P: Pose-Grounded Video Understanding","ref_index":18,"is_internal_anchor":true},{"citing_arxiv_id":"2605.20838","citing_title":"USV: Towards Understanding the User-generated Short-form Videos","ref_index":13,"is_internal_anchor":true},{"citing_arxiv_id":"1911.11641","citing_title":"PIQA: Reasoning about Physical Commonsense in Natural Language","ref_index":34,"is_internal_anchor":true},{"citing_arxiv_id":"2212.03191","citing_title":"InternVideo: General Video Foundation Models via Generative and Discriminative Learning","ref_index":69,"is_internal_anchor":true},{"citing_arxiv_id":"2312.14125","citing_title":"VideoPoet: A Large Language Model for Zero-Shot Video Generation","ref_index":7,"is_internal_anchor":true},{"citing_arxiv_id":"2205.01917","citing_title":"CoCa: Contrastive Captioners are Image-Text Foundation Models","ref_index":58,"is_internal_anchor":true},{"citing_arxiv_id":"2312.14238","citing_title":"InternVL: Scaling up Vision Foundation Models and Aligning for Generic Visual-Linguistic Tasks","ref_index":19,"is_internal_anchor":false},{"citing_arxiv_id":"2310.05737","citing_title":"Language Model Beats Diffusion -- Tokenizer is Key to Visual Generation","ref_index":266,"is_internal_anchor":false},{"citing_arxiv_id":"2204.03458","citing_title":"Video Diffusion Models","ref_index":9,"is_internal_anchor":false},{"citing_arxiv_id":"2303.15389","citing_title":"EVA-CLIP: Improved Training Techniques for CLIP at Scale","ref_index":7,"is_internal_anchor":false},{"citing_arxiv_id":"2605.00434","citing_title":"LIMSSR: LLM-Driven Sequence-to-Score Reasoning under Training-Time Incomplete Multimodal Observations","ref_index":65,"is_internal_anchor":false},{"citing_arxiv_id":"2205.15868","citing_title":"CogVideo: Large-scale Pretraining for Text-to-Video Generation via Transformers","ref_index":3,"is_internal_anchor":false},{"citing_arxiv_id":"2604.08050","citing_title":"ABMAMBA: Multimodal Large Language Model with Aligned Hierarchical Bidirectional Scan for Efficient Video Captioning","ref_index":8,"is_internal_anchor":false},{"citing_arxiv_id":"2605.07859","citing_title":"EyeCue: Driver Cognitive Distraction Detection via Gaze-Empowered Egocentric Video Understanding","ref_index":37,"is_internal_anchor":false},{"citing_arxiv_id":"2506.09985","citing_title":"V-JEPA 2: Self-Supervised Video Models Enable Understanding, Prediction and Planning","ref_index":10,"is_internal_anchor":false},{"citing_arxiv_id":"2604.13667","citing_title":"From Pixels to Nucleotides: End-to-End Token-Based Video Compression for DNA Storage","ref_index":47,"is_internal_anchor":false},{"citing_arxiv_id":"2604.18367","citing_title":"EAST: Early Action Prediction Sampling Strategy with Token Masking","ref_index":1,"is_internal_anchor":false},{"citing_arxiv_id":"2604.17062","citing_title":"Motion-Guided Semantic Alignment with Negative Prompts for Zero-Shot Video Action Recognition","ref_index":29,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/UOVSYDGJM7RKTZJJQPYJQV2R45","json":"https://pith.science/pith/UOVSYDGJM7RKTZJJQPYJQV2R45.json","graph_json":"https://pith.science/api/pith-number/UOVSYDGJM7RKTZJJQPYJQV2R45/graph.json","events_json":"https://pith.science/api/pith-number/UOVSYDGJM7RKTZJJQPYJQV2R45/events.json","paper":"https://pith.science/paper/UOVSYDGJ"},"agent_actions":{"view_html":"https://pith.science/pith/UOVSYDGJM7RKTZJJQPYJQV2R45","download_json":"https://pith.science/pith/UOVSYDGJM7RKTZJJQPYJQV2R45.json","view_paper":"https://pith.science/paper/UOVSYDGJ","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1808.01340&json=true","fetch_graph":"https://pith.science/api/pith-number/UOVSYDGJM7RKTZJJQPYJQV2R45/graph.json","fetch_events":"https://pith.science/api/pith-number/UOVSYDGJM7RKTZJJQPYJQV2R45/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/UOVSYDGJM7RKTZJJQPYJQV2R45/action/timestamp_anchor","attest_storage":"https://pith.science/pith/UOVSYDGJM7RKTZJJQPYJQV2R45/action/storage_attestation","attest_author":"https://pith.science/pith/UOVSYDGJM7RKTZJJQPYJQV2R45/action/author_attestation","sign_citation":"https://pith.science/pith/UOVSYDGJM7RKTZJJQPYJQV2R45/action/citation_signature","submit_replication":"https://pith.science/pith/UOVSYDGJM7RKTZJJQPYJQV2R45/action/replication_record"}},"created_at":"2026-05-18T00:08:53.329306+00:00","updated_at":"2026-05-18T00:08:53.329306+00:00"}