{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2021:5DCOF3VMGTZLMLR7G3WPGXB35H","short_pith_number":"pith:5DCOF3VM","canonical_record":{"source":{"id":"2104.14294","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2021-04-29T12:28:51Z","cross_cats_sorted":[],"title_canon_sha256":"6da5120c2f200ed38479654dc470ad04e74b90b6056c9f2193cbebf59211b2ef","abstract_canon_sha256":"e27e3a9f2795b58bc613cf7732e789f220f1f340ac0c88e34c09aebf18ca1c51"},"schema_version":"1.0"},"canonical_sha256":"e8c4e2eeac34f2b62e3f36ecf35c3be9dba589951a95e029295327006e6a9849","source":{"kind":"arxiv","id":"2104.14294","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2104.14294","created_at":"2026-05-17T23:38:47Z"},{"alias_kind":"arxiv_version","alias_value":"2104.14294v2","created_at":"2026-05-17T23:38:47Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2104.14294","created_at":"2026-05-17T23:38:47Z"},{"alias_kind":"pith_short_12","alias_value":"5DCOF3VMGTZL","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_16","alias_value":"5DCOF3VMGTZLMLR7","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_8","alias_value":"5DCOF3VM","created_at":"2026-05-18T12:33:33Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2021:5DCOF3VMGTZLMLR7G3WPGXB35H","target":"record","payload":{"canonical_record":{"source":{"id":"2104.14294","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2021-04-29T12:28:51Z","cross_cats_sorted":[],"title_canon_sha256":"6da5120c2f200ed38479654dc470ad04e74b90b6056c9f2193cbebf59211b2ef","abstract_canon_sha256":"e27e3a9f2795b58bc613cf7732e789f220f1f340ac0c88e34c09aebf18ca1c51"},"schema_version":"1.0"},"canonical_sha256":"e8c4e2eeac34f2b62e3f36ecf35c3be9dba589951a95e029295327006e6a9849","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:47.667059Z","signature_b64":"i9CHGKQKsjfwkf2Nw8c3vDaExRLbqn7OvH1Cie9JGimfOFZ4pLFjLxiMsZabiTq1oX4lYSyPX93D6FkkVbfUAA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"e8c4e2eeac34f2b62e3f36ecf35c3be9dba589951a95e029295327006e6a9849","last_reissued_at":"2026-05-17T23:38:47.666608Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:47.666608Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2104.14294","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:47Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"SkENROosB2+0aWBe7te6jPWC0japPdVQl4Fz4YR8cfCRnUEd1FuoR6qA/EmPqV272h3PnQc8kU1Yvqo3mv0ZDA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-02T06:33:40.637296Z"},"content_sha256":"a862602b02c0b8559ba54ee7cf2ac59acf536913cbf3c4ed779f0f1b68597178","schema_version":"1.0","event_id":"sha256:a862602b02c0b8559ba54ee7cf2ac59acf536913cbf3c4ed779f0f1b68597178"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2021:5DCOF3VMGTZLMLR7G3WPGXB35H","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Emerging Properties in Self-Supervised Vision Transformers","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Self-supervised Vision Transformers encode explicit semantic segmentation information in their features.","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Armand Joulin, Herv\\'e J\\'egou, Hugo Touvron, Ishan Misra, Julien Mairal, Mathilde Caron, Piotr Bojanowski","submitted_at":"2021-04-29T12:28:51Z","abstract_excerpt":"In this paper, we question if self-supervised learning provides new properties to Vision Transformer (ViT) that stand out compared to convolutional networks (convnets). Beyond the fact that adapting self-supervised methods to this architecture works particularly well, we make the following observations: first, self-supervised ViT features contain explicit information about the semantic segmentation of an image, which does not emerge as clearly with supervised ViTs, nor with convnets. Second, these features are also excellent k-NN classifiers, reaching 78.3% top-1 on ImageNet with a small ViT. "},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"self-supervised ViT features contain explicit information about the semantic segmentation of an image, which does not emerge as clearly with supervised ViTs, nor with convnets [...] achieving 80.1% top-1 on ImageNet in linear evaluation with ViT-Base.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The assumption that the observed semantic segmentation information and k-NN performance arise specifically from the interaction of self-supervision with the ViT architecture rather than from particular hyperparameter choices, dataset statistics, or evaluation protocols.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Self-supervised ViTs show emergent semantic segmentation and 78.3% k-NN accuracy on ImageNet; DINO reaches 80.1% linear evaluation with ViT-Base.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Self-supervised Vision Transformers encode explicit semantic segmentation information in their features.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"e7744828491c16518e765229c1e6b7f357b45db74bab277450dec402b8d5e640"},"source":{"id":"2104.14294","kind":"arxiv","version":2},"verdict":{"id":"558d59f9-ecae-48a5-a6aa-d48188c806ed","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-16T13:59:46.222058Z","strongest_claim":"self-supervised ViT features contain explicit information about the semantic segmentation of an image, which does not emerge as clearly with supervised ViTs, nor with convnets [...] achieving 80.1% top-1 on ImageNet in linear evaluation with ViT-Base.","one_line_summary":"Self-supervised ViTs show emergent semantic segmentation and 78.3% k-NN accuracy on ImageNet; DINO reaches 80.1% linear evaluation with ViT-Base.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The assumption that the observed semantic segmentation information and k-NN performance arise specifically from the interaction of self-supervision with the ViT architecture rather than from particular hyperparameter choices, dataset statistics, or evaluation protocols.","pith_extraction_headline":"Self-supervised Vision Transformers encode explicit semantic segmentation information in their features."},"references":{"count":85,"sample":[{"doi":"","year":2018,"title":"arXiv preprint arXiv:1804.03235 , year=","work_id":"eaf8922f-efbd-41b1-b29e-d4589ef7f01d","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2020,"title":"Self-labelling via simultaneous clustering and repre- sentation learning","work_id":"fcdc83a2-74ca-44ac-90cc-81fc3220fdba","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2006,"title":"preprint arXiv:2006.10803 , year=","work_id":"ff95292e-cd8e-4c2c-869c-f8c046cf5db7","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2014,"title":"Neural Machine Translation by Jointly Learning to Align and Translate","work_id":"d831e763-d530-4029-a65c-ac595d82cb2a","ref_index":4,"cited_arxiv_id":"1409.0473","is_internal_anchor":true},{"doi":"","year":1902,"title":"MultiGrain : a unified image embedding for classes and instances","work_id":"54472c12-9ee6-4a11-86b6-d57d3cfa0459","ref_index":5,"cited_arxiv_id":"1902.05509","is_internal_anchor":true}],"resolved_work":85,"snapshot_sha256":"a05ec25062deda5e9f77a8fe372fd781435d2a2bc718348de1151b8b3494e3d1","internal_anchors":17},"formal_canon":{"evidence_count":2,"snapshot_sha256":"13991ee97d01c10630ca894aec8db810cdc4dc60ddc1a4ca638ecc81a41bd46c"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"558d59f9-ecae-48a5-a6aa-d48188c806ed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:47Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"DmFYbqu3Npw1kRX3WdDZhHnLmM0OCXmkdc89eTrbRYekqaSwEwZsrxi6A3qnMxsDx/17GrKvYlpBvnZ61Bt5DA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-02T06:33:40.638195Z"},"content_sha256":"6005a6cccf502d2e99bbdd980a67403c6497c8a57cf8662c1208565c3258a35c","schema_version":"1.0","event_id":"sha256:6005a6cccf502d2e99bbdd980a67403c6497c8a57cf8662c1208565c3258a35c"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/5DCOF3VMGTZLMLR7G3WPGXB35H/bundle.json","state_url":"https://pith.science/pith/5DCOF3VMGTZLMLR7G3WPGXB35H/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/5DCOF3VMGTZLMLR7G3WPGXB35H/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-02T06:33:40Z","links":{"resolver":"https://pith.science/pith/5DCOF3VMGTZLMLR7G3WPGXB35H","bundle":"https://pith.science/pith/5DCOF3VMGTZLMLR7G3WPGXB35H/bundle.json","state":"https://pith.science/pith/5DCOF3VMGTZLMLR7G3WPGXB35H/state.json","well_known_bundle":"https://pith.science/.well-known/pith/5DCOF3VMGTZLMLR7G3WPGXB35H/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2021:5DCOF3VMGTZLMLR7G3WPGXB35H","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"e27e3a9f2795b58bc613cf7732e789f220f1f340ac0c88e34c09aebf18ca1c51","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2021-04-29T12:28:51Z","title_canon_sha256":"6da5120c2f200ed38479654dc470ad04e74b90b6056c9f2193cbebf59211b2ef"},"schema_version":"1.0","source":{"id":"2104.14294","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2104.14294","created_at":"2026-05-17T23:38:47Z"},{"alias_kind":"arxiv_version","alias_value":"2104.14294v2","created_at":"2026-05-17T23:38:47Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2104.14294","created_at":"2026-05-17T23:38:47Z"},{"alias_kind":"pith_short_12","alias_value":"5DCOF3VMGTZL","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_16","alias_value":"5DCOF3VMGTZLMLR7","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_8","alias_value":"5DCOF3VM","created_at":"2026-05-18T12:33:33Z"}],"graph_snapshots":[{"event_id":"sha256:6005a6cccf502d2e99bbdd980a67403c6497c8a57cf8662c1208565c3258a35c","target":"graph","created_at":"2026-05-17T23:38:47Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"self-supervised ViT features contain explicit information about the semantic segmentation of an image, which does not emerge as clearly with supervised ViTs, nor with convnets [...] achieving 80.1% top-1 on ImageNet in linear evaluation with ViT-Base."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"The assumption that the observed semantic segmentation information and k-NN performance arise specifically from the interaction of self-supervision with the ViT architecture rather than from particular hyperparameter choices, dataset statistics, or evaluation protocols."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Self-supervised ViTs show emergent semantic segmentation and 78.3% k-NN accuracy on ImageNet; DINO reaches 80.1% linear evaluation with ViT-Base."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Self-supervised Vision Transformers encode explicit semantic segmentation information in their features."}],"snapshot_sha256":"e7744828491c16518e765229c1e6b7f357b45db74bab277450dec402b8d5e640"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"13991ee97d01c10630ca894aec8db810cdc4dc60ddc1a4ca638ecc81a41bd46c"},"paper":{"abstract_excerpt":"In this paper, we question if self-supervised learning provides new properties to Vision Transformer (ViT) that stand out compared to convolutional networks (convnets). Beyond the fact that adapting self-supervised methods to this architecture works particularly well, we make the following observations: first, self-supervised ViT features contain explicit information about the semantic segmentation of an image, which does not emerge as clearly with supervised ViTs, nor with convnets. Second, these features are also excellent k-NN classifiers, reaching 78.3% top-1 on ImageNet with a small ViT. ","authors_text":"Armand Joulin, Herv\\'e J\\'egou, Hugo Touvron, Ishan Misra, Julien Mairal, Mathilde Caron, Piotr Bojanowski","cross_cats":[],"headline":"Self-supervised Vision Transformers encode explicit semantic segmentation information in their features.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2021-04-29T12:28:51Z","title":"Emerging Properties in Self-Supervised Vision Transformers"},"references":{"count":85,"internal_anchors":17,"resolved_work":85,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"arXiv preprint arXiv:1804.03235 , year=","work_id":"eaf8922f-efbd-41b1-b29e-d4589ef7f01d","year":2018},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Self-labelling via simultaneous clustering and repre- sentation learning","work_id":"fcdc83a2-74ca-44ac-90cc-81fc3220fdba","year":2020},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"preprint arXiv:2006.10803 , year=","work_id":"ff95292e-cd8e-4c2c-869c-f8c046cf5db7","year":2006},{"cited_arxiv_id":"1409.0473","doi":"","is_internal_anchor":true,"ref_index":4,"title":"Neural Machine Translation by Jointly Learning to Align and Translate","work_id":"d831e763-d530-4029-a65c-ac595d82cb2a","year":2014},{"cited_arxiv_id":"1902.05509","doi":"","is_internal_anchor":true,"ref_index":5,"title":"MultiGrain : a unified image embedding for classes and instances","work_id":"54472c12-9ee6-4a11-86b6-d57d3cfa0459","year":1902}],"snapshot_sha256":"a05ec25062deda5e9f77a8fe372fd781435d2a2bc718348de1151b8b3494e3d1"},"source":{"id":"2104.14294","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-16T13:59:46.222058Z","id":"558d59f9-ecae-48a5-a6aa-d48188c806ed","model_set":{"reader":"grok-4.3"},"one_line_summary":"Self-supervised ViTs show emergent semantic segmentation and 78.3% k-NN accuracy on ImageNet; DINO reaches 80.1% linear evaluation with ViT-Base.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Self-supervised Vision Transformers encode explicit semantic segmentation information in their features.","strongest_claim":"self-supervised ViT features contain explicit information about the semantic segmentation of an image, which does not emerge as clearly with supervised ViTs, nor with convnets [...] achieving 80.1% top-1 on ImageNet in linear evaluation with ViT-Base.","weakest_assumption":"The assumption that the observed semantic segmentation information and k-NN performance arise specifically from the interaction of self-supervision with the ViT architecture rather than from particular hyperparameter choices, dataset statistics, or evaluation protocols."}},"verdict_id":"558d59f9-ecae-48a5-a6aa-d48188c806ed"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:a862602b02c0b8559ba54ee7cf2ac59acf536913cbf3c4ed779f0f1b68597178","target":"record","created_at":"2026-05-17T23:38:47Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"e27e3a9f2795b58bc613cf7732e789f220f1f340ac0c88e34c09aebf18ca1c51","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2021-04-29T12:28:51Z","title_canon_sha256":"6da5120c2f200ed38479654dc470ad04e74b90b6056c9f2193cbebf59211b2ef"},"schema_version":"1.0","source":{"id":"2104.14294","kind":"arxiv","version":2}},"canonical_sha256":"e8c4e2eeac34f2b62e3f36ecf35c3be9dba589951a95e029295327006e6a9849","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"e8c4e2eeac34f2b62e3f36ecf35c3be9dba589951a95e029295327006e6a9849","first_computed_at":"2026-05-17T23:38:47.666608Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:47.666608Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"i9CHGKQKsjfwkf2Nw8c3vDaExRLbqn7OvH1Cie9JGimfOFZ4pLFjLxiMsZabiTq1oX4lYSyPX93D6FkkVbfUAA==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:47.667059Z","signed_message":"canonical_sha256_bytes"},"source_id":"2104.14294","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:a862602b02c0b8559ba54ee7cf2ac59acf536913cbf3c4ed779f0f1b68597178","sha256:6005a6cccf502d2e99bbdd980a67403c6497c8a57cf8662c1208565c3258a35c"],"state_sha256":"8e5c684b7553a16b1647b03cb622777a10c468996353822296484f90fc87ef03"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"GwuY6MJq4xVRCpiFoSnSSYxNhnW0JAd48WPcGEwafLYtTUFHuF1nyVP6QlB8lOYXZvxVthvxr+m8yP1bLnrJBg==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-02T06:33:40.641606Z","bundle_sha256":"50371f2558d64162d79a6fd7a1d0696f4e4dabfd2d5bd116adb7533b012a43f3"}}