{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2017:PZIWZYZKA3ARYYYBRT2KTJ4UJ2","short_pith_number":"pith:PZIWZYZK","schema_version":"1.0","canonical_sha256":"7e516ce32a06c11c63018cf4a9a7944e868c05a0556bce249fcb5a34ebf25ef5","source":{"kind":"arxiv","id":"1705.07750","version":3},"attestation_state":"computed","paper":{"title":"Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.LG"],"primary_cat":"cs.CV","authors_text":"Andrew Zisserman, Joao Carreira","submitted_at":"2017-05-22T13:57:53Z","abstract_excerpt":"The paucity of videos in current action classification datasets (UCF-101 and HMDB-51) has made it difficult to identify good video architectures, as most methods obtain similar performance on existing small-scale benchmarks. This paper re-evaluates state-of-the-art architectures in light of the new Kinetics Human Action Video dataset. Kinetics has two orders of magnitude more data, with 400 human action classes and over 400 clips per class, and is collected from realistic, challenging YouTube videos. We provide an analysis on how current architectures fare on the task of action classification "},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1705.07750","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2017-05-22T13:57:53Z","cross_cats_sorted":["cs.LG"],"title_canon_sha256":"2a1922063541aebfe314cbd45414001a329d862ff2dd1b4e10f7efbf39f3817e","abstract_canon_sha256":"42c7c9b1084e220e9c4c166741c1d83ce0199b62ccdd51594f2261a239dd95b9"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T00:23:56.547464Z","signature_b64":"GnUyvZgIGmih1HB5ay94yNLKqC4ak32Mi8vbmvMqtuf2Dl8RdUORZEDE7INFbW925nYjphpqrG5FAFzdjPxsDg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"7e516ce32a06c11c63018cf4a9a7944e868c05a0556bce249fcb5a34ebf25ef5","last_reissued_at":"2026-05-18T00:23:56.546772Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T00:23:56.546772Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.LG"],"primary_cat":"cs.CV","authors_text":"Andrew Zisserman, Joao Carreira","submitted_at":"2017-05-22T13:57:53Z","abstract_excerpt":"The paucity of videos in current action classification datasets (UCF-101 and HMDB-51) has made it difficult to identify good video architectures, as most methods obtain similar performance on existing small-scale benchmarks. This paper re-evaluates state-of-the-art architectures in light of the new Kinetics Human Action Video dataset. Kinetics has two orders of magnitude more data, with 400 human action classes and over 400 clips per class, and is collected from realistic, challenging YouTube videos. We provide an analysis on how current architectures fare on the task of action classification "},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1705.07750","kind":"arxiv","version":3},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1705.07750","created_at":"2026-05-18T00:23:56.546879+00:00"},{"alias_kind":"arxiv_version","alias_value":"1705.07750v3","created_at":"2026-05-18T00:23:56.546879+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1705.07750","created_at":"2026-05-18T00:23:56.546879+00:00"},{"alias_kind":"pith_short_12","alias_value":"PZIWZYZKA3AR","created_at":"2026-05-18T12:31:37.085036+00:00"},{"alias_kind":"pith_short_16","alias_value":"PZIWZYZKA3ARYYYB","created_at":"2026-05-18T12:31:37.085036+00:00"},{"alias_kind":"pith_short_8","alias_value":"PZIWZYZK","created_at":"2026-05-18T12:31:37.085036+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":6,"internal_anchor_count":4,"sample":[{"citing_arxiv_id":"1907.10473","citing_title":"Switchable Normalization for Learning-to-Normalize Deep Representation","ref_index":58,"is_internal_anchor":true},{"citing_arxiv_id":"2605.22200","citing_title":"OSS: Open Suturing Skills Vision-Based Assessment Challenge 2024-2025","ref_index":9,"is_internal_anchor":true},{"citing_arxiv_id":"2605.17133","citing_title":"CAM-VFD: Cross-Attention Multimodal Video Forgery Detection","ref_index":37,"is_internal_anchor":true},{"citing_arxiv_id":"2605.13202","citing_title":"STAR: Semantic-Temporal Adaptive Representation Learning for Few-Shot Action Recognition","ref_index":3,"is_internal_anchor":true},{"citing_arxiv_id":"2604.16987","citing_title":"DVAR: Adversarial Multi-Agent Debate for Video Authenticity Detection","ref_index":5,"is_internal_anchor":false},{"citing_arxiv_id":"2605.02094","citing_title":"SignMAE: Segmentation-Driven Self-Supervised Learning for Sign Language Recognition","ref_index":3,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/PZIWZYZKA3ARYYYBRT2KTJ4UJ2","json":"https://pith.science/pith/PZIWZYZKA3ARYYYBRT2KTJ4UJ2.json","graph_json":"https://pith.science/api/pith-number/PZIWZYZKA3ARYYYBRT2KTJ4UJ2/graph.json","events_json":"https://pith.science/api/pith-number/PZIWZYZKA3ARYYYBRT2KTJ4UJ2/events.json","paper":"https://pith.science/paper/PZIWZYZK"},"agent_actions":{"view_html":"https://pith.science/pith/PZIWZYZKA3ARYYYBRT2KTJ4UJ2","download_json":"https://pith.science/pith/PZIWZYZKA3ARYYYBRT2KTJ4UJ2.json","view_paper":"https://pith.science/paper/PZIWZYZK","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1705.07750&json=true","fetch_graph":"https://pith.science/api/pith-number/PZIWZYZKA3ARYYYBRT2KTJ4UJ2/graph.json","fetch_events":"https://pith.science/api/pith-number/PZIWZYZKA3ARYYYBRT2KTJ4UJ2/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/PZIWZYZKA3ARYYYBRT2KTJ4UJ2/action/timestamp_anchor","attest_storage":"https://pith.science/pith/PZIWZYZKA3ARYYYBRT2KTJ4UJ2/action/storage_attestation","attest_author":"https://pith.science/pith/PZIWZYZKA3ARYYYBRT2KTJ4UJ2/action/author_attestation","sign_citation":"https://pith.science/pith/PZIWZYZKA3ARYYYBRT2KTJ4UJ2/action/citation_signature","submit_replication":"https://pith.science/pith/PZIWZYZKA3ARYYYBRT2KTJ4UJ2/action/replication_record"}},"created_at":"2026-05-18T00:23:56.546879+00:00","updated_at":"2026-05-18T00:23:56.546879+00:00"}