{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2015:42GXE2YQY5W72NWZEP7NV7M526","short_pith_number":"pith:42GXE2YQ","canonical_record":{"source":{"id":"1505.00687","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2015-05-04T15:50:53Z","cross_cats_sorted":[],"title_canon_sha256":"8b7d0664d7518da36b8abc1ce40189146c9c16053ce895702ad52cae771a5f20","abstract_canon_sha256":"33a6496c3501eab0e8cd6293443ebc67e28014ae64aa33eccb0c8e6ee6b34fdd"},"schema_version":"1.0"},"canonical_sha256":"e68d726b10c76dfd36d923fedafd9dd79f060b95d4639a84228884ef5bebceb3","source":{"kind":"arxiv","id":"1505.00687","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"1505.00687","created_at":"2026-05-18T01:31:02Z"},{"alias_kind":"arxiv_version","alias_value":"1505.00687v2","created_at":"2026-05-18T01:31:02Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1505.00687","created_at":"2026-05-18T01:31:02Z"},{"alias_kind":"pith_short_12","alias_value":"42GXE2YQY5W7","created_at":"2026-05-18T12:29:02Z"},{"alias_kind":"pith_short_16","alias_value":"42GXE2YQY5W72NWZ","created_at":"2026-05-18T12:29:02Z"},{"alias_kind":"pith_short_8","alias_value":"42GXE2YQ","created_at":"2026-05-18T12:29:02Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2015:42GXE2YQY5W72NWZEP7NV7M526","target":"record","payload":{"canonical_record":{"source":{"id":"1505.00687","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2015-05-04T15:50:53Z","cross_cats_sorted":[],"title_canon_sha256":"8b7d0664d7518da36b8abc1ce40189146c9c16053ce895702ad52cae771a5f20","abstract_canon_sha256":"33a6496c3501eab0e8cd6293443ebc67e28014ae64aa33eccb0c8e6ee6b34fdd"},"schema_version":"1.0"},"canonical_sha256":"e68d726b10c76dfd36d923fedafd9dd79f060b95d4639a84228884ef5bebceb3","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T01:31:02.908859Z","signature_b64":"MXJFfromaom3FC1cs2qhBPOKUWV3gOECKwezVKZ/jSiygtWM2eZcxPInnnakkh8LPZa4+lmmkXQEMjzKXzvLBA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"e68d726b10c76dfd36d923fedafd9dd79f060b95d4639a84228884ef5bebceb3","last_reissued_at":"2026-05-18T01:31:02.908182Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T01:31:02.908182Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"1505.00687","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T01:31:02Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"dXvugOrtRBFkq0kQVoRNf90L1EQ3BX0m5UDpVIOj69+LH/E/4/AVrqC9ilj3UcsA1Q0WaV21MeTbs2aJvY8tBQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-30T14:22:32.173541Z"},"content_sha256":"c4b7b522d522919e409fd2de7df690fb9ed428c714d03d8a6438fd16ce5cdcc6","schema_version":"1.0","event_id":"sha256:c4b7b522d522919e409fd2de7df690fb9ed428c714d03d8a6438fd16ce5cdcc6"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2015:42GXE2YQY5W72NWZEP7NV7M526","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Unsupervised Learning of Visual Representations using Videos","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Abhinav Gupta, Xiaolong Wang","submitted_at":"2015-05-04T15:50:53Z","abstract_excerpt":"Is strong supervision necessary for learning a good visual representation? Do we really need millions of semantically-labeled images to train a Convolutional Neural Network (CNN)? In this paper, we present a simple yet surprisingly powerful approach for unsupervised learning of CNN. Specifically, we use hundreds of thousands of unlabeled videos from the web to learn visual representations. Our key idea is that visual tracking provides the supervision. That is, two patches connected by a track should have similar visual representation in deep feature space since they probably belong to the same"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1505.00687","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T01:31:02Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"3ToAselUrVTMY4R24OdovVNkDCNsA7ge+T+Hq+TXKga6DV7DsKS3Z/Sh1PjRKofMAb3tx/2Lq0Tb1XMRAZcIAA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-30T14:22:32.174219Z"},"content_sha256":"5c6abf314832998e87bfc9b24477d41d4691f976ba371978d41d7e3d2f13dac4","schema_version":"1.0","event_id":"sha256:5c6abf314832998e87bfc9b24477d41d4691f976ba371978d41d7e3d2f13dac4"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/42GXE2YQY5W72NWZEP7NV7M526/bundle.json","state_url":"https://pith.science/pith/42GXE2YQY5W72NWZEP7NV7M526/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/42GXE2YQY5W72NWZEP7NV7M526/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-30T14:22:32Z","links":{"resolver":"https://pith.science/pith/42GXE2YQY5W72NWZEP7NV7M526","bundle":"https://pith.science/pith/42GXE2YQY5W72NWZEP7NV7M526/bundle.json","state":"https://pith.science/pith/42GXE2YQY5W72NWZEP7NV7M526/state.json","well_known_bundle":"https://pith.science/.well-known/pith/42GXE2YQY5W72NWZEP7NV7M526/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2015:42GXE2YQY5W72NWZEP7NV7M526","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"33a6496c3501eab0e8cd6293443ebc67e28014ae64aa33eccb0c8e6ee6b34fdd","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2015-05-04T15:50:53Z","title_canon_sha256":"8b7d0664d7518da36b8abc1ce40189146c9c16053ce895702ad52cae771a5f20"},"schema_version":"1.0","source":{"id":"1505.00687","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"1505.00687","created_at":"2026-05-18T01:31:02Z"},{"alias_kind":"arxiv_version","alias_value":"1505.00687v2","created_at":"2026-05-18T01:31:02Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1505.00687","created_at":"2026-05-18T01:31:02Z"},{"alias_kind":"pith_short_12","alias_value":"42GXE2YQY5W7","created_at":"2026-05-18T12:29:02Z"},{"alias_kind":"pith_short_16","alias_value":"42GXE2YQY5W72NWZ","created_at":"2026-05-18T12:29:02Z"},{"alias_kind":"pith_short_8","alias_value":"42GXE2YQ","created_at":"2026-05-18T12:29:02Z"}],"graph_snapshots":[{"event_id":"sha256:5c6abf314832998e87bfc9b24477d41d4691f976ba371978d41d7e3d2f13dac4","target":"graph","created_at":"2026-05-18T01:31:02Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"Is strong supervision necessary for learning a good visual representation? Do we really need millions of semantically-labeled images to train a Convolutional Neural Network (CNN)? In this paper, we present a simple yet surprisingly powerful approach for unsupervised learning of CNN. Specifically, we use hundreds of thousands of unlabeled videos from the web to learn visual representations. Our key idea is that visual tracking provides the supervision. That is, two patches connected by a track should have similar visual representation in deep feature space since they probably belong to the same","authors_text":"Abhinav Gupta, Xiaolong Wang","cross_cats":[],"headline":"","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2015-05-04T15:50:53Z","title":"Unsupervised Learning of Visual Representations using Videos"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1505.00687","kind":"arxiv","version":2},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:c4b7b522d522919e409fd2de7df690fb9ed428c714d03d8a6438fd16ce5cdcc6","target":"record","created_at":"2026-05-18T01:31:02Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"33a6496c3501eab0e8cd6293443ebc67e28014ae64aa33eccb0c8e6ee6b34fdd","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2015-05-04T15:50:53Z","title_canon_sha256":"8b7d0664d7518da36b8abc1ce40189146c9c16053ce895702ad52cae771a5f20"},"schema_version":"1.0","source":{"id":"1505.00687","kind":"arxiv","version":2}},"canonical_sha256":"e68d726b10c76dfd36d923fedafd9dd79f060b95d4639a84228884ef5bebceb3","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"e68d726b10c76dfd36d923fedafd9dd79f060b95d4639a84228884ef5bebceb3","first_computed_at":"2026-05-18T01:31:02.908182Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-18T01:31:02.908182Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"MXJFfromaom3FC1cs2qhBPOKUWV3gOECKwezVKZ/jSiygtWM2eZcxPInnnakkh8LPZa4+lmmkXQEMjzKXzvLBA==","signature_status":"signed_v1","signed_at":"2026-05-18T01:31:02.908859Z","signed_message":"canonical_sha256_bytes"},"source_id":"1505.00687","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:c4b7b522d522919e409fd2de7df690fb9ed428c714d03d8a6438fd16ce5cdcc6","sha256:5c6abf314832998e87bfc9b24477d41d4691f976ba371978d41d7e3d2f13dac4"],"state_sha256":"e4a0cb6043dd7458ac3c3a046c2ec66cd2f1e608d07eaace1845ed6dbe478d4d"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"58yK8PoQ1/YtHoro50HhMYSZAtNEKsAdEup1c5gLg9u7KxlueCMhnekxfXBoUoOHukpgAzywA7FNvAc7G+S8DA==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-30T14:22:32.179236Z","bundle_sha256":"d014709ab60ebce8eb5d1981f20d82ad9185104d37256deb8c199a1d8e1125b3"}}