{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2014:M3ZEIEXKKQS6VOSESGKGYWEIN4","short_pith_number":"pith:M3ZEIEXK","schema_version":"1.0","canonical_sha256":"66f24412ea5425eaba4491946c58886f39d058d1a029743f2c43bc0d52e805c7","source":{"kind":"arxiv","id":"1406.2199","version":2},"attestation_state":"computed","paper":{"title":"Two-Stream Convolutional Networks for Action Recognition in Videos","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Andrew Zisserman, Karen Simonyan","submitted_at":"2014-06-09T14:44:14Z","abstract_excerpt":"We investigate architectures of discriminatively trained deep Convolutional Networks (ConvNets) for action recognition in video. The challenge is to capture the complementary information on appearance from still frames and motion between frames. We also aim to generalise the best performing hand-crafted features within a data-driven learning framework.\n  Our contribution is three-fold. First, we propose a two-stream ConvNet architecture which incorporates spatial and temporal networks. Second, we demonstrate that a ConvNet trained on multi-frame dense optical flow is able to achieve very good "},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1406.2199","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2014-06-09T14:44:14Z","cross_cats_sorted":[],"title_canon_sha256":"2d0ca9c9242bbd5797320a538090fcb86aeeb22876119404448652b7022fc7ab","abstract_canon_sha256":"dcf0b36b50d7ad95db13c6674ab2ec83e22af8bdea1d79deaece30c06bb259e3"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T02:37:51.702258Z","signature_b64":"u13MyQfHBJy+f368Ra90QKiPQ5H5VhPl1GYT5sgcqPlfPRDA1HkO+2gAbNFRdGj9Ol8M4PIho/xoSj2oEHGfAA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"66f24412ea5425eaba4491946c58886f39d058d1a029743f2c43bc0d52e805c7","last_reissued_at":"2026-05-18T02:37:51.701725Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T02:37:51.701725Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Two-Stream Convolutional Networks for Action Recognition in Videos","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Andrew Zisserman, Karen Simonyan","submitted_at":"2014-06-09T14:44:14Z","abstract_excerpt":"We investigate architectures of discriminatively trained deep Convolutional Networks (ConvNets) for action recognition in video. The challenge is to capture the complementary information on appearance from still frames and motion between frames. We also aim to generalise the best performing hand-crafted features within a data-driven learning framework.\n  Our contribution is three-fold. First, we propose a two-stream ConvNet architecture which incorporates spatial and temporal networks. Second, we demonstrate that a ConvNet trained on multi-frame dense optical flow is able to achieve very good "},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1406.2199","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1406.2199","created_at":"2026-05-18T02:37:51.701812+00:00"},{"alias_kind":"arxiv_version","alias_value":"1406.2199v2","created_at":"2026-05-18T02:37:51.701812+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1406.2199","created_at":"2026-05-18T02:37:51.701812+00:00"},{"alias_kind":"pith_short_12","alias_value":"M3ZEIEXKKQS6","created_at":"2026-05-18T12:28:38.356838+00:00"},{"alias_kind":"pith_short_16","alias_value":"M3ZEIEXKKQS6VOSE","created_at":"2026-05-18T12:28:38.356838+00:00"},{"alias_kind":"pith_short_8","alias_value":"M3ZEIEXK","created_at":"2026-05-18T12:28:38.356838+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":1,"internal_anchor_count":0,"sample":[{"citing_arxiv_id":"2604.23415","citing_title":"A Heterogeneous Two-Stream Framework for Video Action Recognition with Comparative Fusion Analysis","ref_index":32,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/M3ZEIEXKKQS6VOSESGKGYWEIN4","json":"https://pith.science/pith/M3ZEIEXKKQS6VOSESGKGYWEIN4.json","graph_json":"https://pith.science/api/pith-number/M3ZEIEXKKQS6VOSESGKGYWEIN4/graph.json","events_json":"https://pith.science/api/pith-number/M3ZEIEXKKQS6VOSESGKGYWEIN4/events.json","paper":"https://pith.science/paper/M3ZEIEXK"},"agent_actions":{"view_html":"https://pith.science/pith/M3ZEIEXKKQS6VOSESGKGYWEIN4","download_json":"https://pith.science/pith/M3ZEIEXKKQS6VOSESGKGYWEIN4.json","view_paper":"https://pith.science/paper/M3ZEIEXK","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1406.2199&json=true","fetch_graph":"https://pith.science/api/pith-number/M3ZEIEXKKQS6VOSESGKGYWEIN4/graph.json","fetch_events":"https://pith.science/api/pith-number/M3ZEIEXKKQS6VOSESGKGYWEIN4/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/M3ZEIEXKKQS6VOSESGKGYWEIN4/action/timestamp_anchor","attest_storage":"https://pith.science/pith/M3ZEIEXKKQS6VOSESGKGYWEIN4/action/storage_attestation","attest_author":"https://pith.science/pith/M3ZEIEXKKQS6VOSESGKGYWEIN4/action/author_attestation","sign_citation":"https://pith.science/pith/M3ZEIEXKKQS6VOSESGKGYWEIN4/action/citation_signature","submit_replication":"https://pith.science/pith/M3ZEIEXKKQS6VOSESGKGYWEIN4/action/replication_record"}},"created_at":"2026-05-18T02:37:51.701812+00:00","updated_at":"2026-05-18T02:37:51.701812+00:00"}