{"work":{"id":"26b304e5-b54a-4f26-be7e-83299eca52e4","openalex_id":null,"doi":null,"arxiv_id":"2304.07193","raw_key":null,"title":"DINOv2: Learning Robust Visual Features without Supervision","authors":null,"authors_text":"Maxime Oquab, Timoth\\'ee Darcet, Th\\'eo Moutakanni, Huy Vo, Marc Szafraniec, Vasil Khalidov","year":2023,"venue":"cs.CV","abstract":"The recent breakthroughs in natural language processing for model pretraining on large quantities of data have opened the way for similar foundation models in computer vision. These models could greatly simplify the use of images in any system by producing all-purpose visual features, i.e., features that work across image distributions and tasks without finetuning. This work shows that existing pretraining methods, especially self-supervised methods, can produce such features if trained on enough curated data from diverse sources. We revisit existing approaches and combine different techniques to scale our pretraining in terms of data and model size. Most of the technical contributions aim at accelerating and stabilizing the training at scale. In terms of data, we propose an automatic pipeline to build a dedicated, diverse, and curated image dataset instead of uncurated data, as typically done in the self-supervised literature. In terms of models, we train a ViT model (Dosovitskiy et al., 2020) with 1B parameters and distill it into a series of smaller models that surpass the best available all-purpose features, OpenCLIP (Ilharco et al., 2021) on most of the benchmarks at image and pixel levels.","external_url":"https://arxiv.org/abs/2304.07193","cited_by_count":null,"metadata_source":"pith","metadata_fetched_at":"2026-06-29T18:33:50.489752+00:00","pith_arxiv_id":"2304.07193","created_at":"2026-05-08T18:23:55.776600+00:00","updated_at":"2026-06-29T18:33:50.489752+00:00","title_quality_ok":true,"display_title":"DINOv2: Learning Robust Visual Features without Supervision","render_title":"DINOv2: Learning Robust Visual Features without Supervision"},"hub":{"state":{"work_id":"26b304e5-b54a-4f26-be7e-83299eca52e4","tier":"super_hub","tier_reason":"100+ Pith inbound or 10,000+ external citations","pith_inbound_count":601,"external_cited_by_count":null,"distinct_field_count":19,"first_pith_cited_at":"2023-05-24T17:59:04+00:00","last_pith_cited_at":"2026-06-24T05:58:03+00:00","author_build_status":"needed","summary_status":"needed","contexts_status":"needed","graph_status":"needed","ask_index_status":"needed","reader_status":"not_needed","recognition_status":"not_needed","updated_at":"2026-06-29T18:39:07.278939+00:00","tier_text":"super_hub"},"tier":"super_hub","role_counts":[{"context_role":"method","n":59},{"context_role":"background","n":57},{"context_role":"baseline","n":9},{"context_role":"dataset","n":3},{"context_role":"other","n":1}],"polarity_counts":[{"context_polarity":"background","n":57},{"context_polarity":"use_method","n":57},{"context_polarity":"baseline","n":9},{"context_polarity":"unclear","n":4},{"context_polarity":"use_dataset","n":2}],"runs":{"ask_index":{"job_type":"ask_index","status":"succeeded","result":{"title":"DINOv2: Learning Robust Visual Features without Supervision","claims":[{"claim_text":"The recent breakthroughs in natural language processing for model pretraining on large quantities of data have opened the way for similar foundation models in computer vision. These models could greatly simplify the use of images in any system by producing all-purpose visual features, i.e., features that work across image distributions and tasks without finetuning. This work shows that existing pretraining methods, especially self-supervised methods, can produce such features if trained on enough curated data from diverse sources. We revisit existing approaches and combine different techniques","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks DINOv2: Learning Robust Visual Features without Supervision because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-13T19:03:27.163402+00:00"},"author_expand":{"job_type":"author_expand","status":"succeeded","result":{"authors_linked":[{"id":"6f0b0fee-fc8c-42aa-b744-c5d37829a086","orcid":null,"display_name":"Maxime Oquab"},{"id":"b3c2fff2-1fd0-45c4-a265-ba833fbc9115","orcid":null,"display_name":"Timoth\\'ee Darcet"},{"id":"2a2200a2-9d8d-4807-b668-79690b3172a1","orcid":null,"display_name":"Th\\'eo Moutakanni"},{"id":"3a748553-6877-4d14-a1b7-8779f8052b78","orcid":null,"display_name":"Huy Vo"},{"id":"efeba865-645b-4761-9405-e6f4bae79432","orcid":null,"display_name":"Marc Szafraniec"},{"id":"0072825c-7418-4dbe-b845-de490ec2080e","orcid":null,"display_name":"Vasil Khalidov"}]},"error":null,"updated_at":"2026-05-13T19:03:27.161035+00:00"},"context_extract":{"job_type":"context_extract","status":"succeeded","result":{"enqueued_papers":25},"error":null,"updated_at":"2026-05-13T18:53:32.193294+00:00"},"graph_features":{"job_type":"graph_features","status":"succeeded","result":{"co_cited":[{"title":"DINOv3","work_id":"c8b07deb-8fe7-4e18-9620-f3569d3529ce","shared_citers":56},{"title":"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale","work_id":"e96730e3-129b-4db6-b981-15ab7932e297","shared_citers":53},{"title":"Decoupled Weight Decay Regularization","work_id":"07ef7360-d385-4033-83f7-8384a6325204","shared_citers":33},{"title":"SAM 2: Segment Anything in Images and Videos","work_id":"acc13f66-d814-44f9-9688-375688bf2d4a","shared_citers":32},{"title":"SigLIP 2: Multilingual Vision-Language Encoders with Improved Semantic Understanding, Localization, and Dense Features","work_id":"50eec732-2d41-432f-9dcf-ac7fff235ea5","shared_citers":31},{"title":"$\\pi_0$: A Vision-Language-Action Flow Model for General Robot Control","work_id":"f790abdc-a796-482f-a40d-f8ee035ecfc2","shared_citers":22},{"title":"Qwen3-VL Technical Report","work_id":"1fe243aa-e3c0-4da6-b391-4cbcfc88d5c0","shared_citers":22},{"title":"Qwen2.5-VL Technical Report","work_id":"69dffacb-bfe8-442d-be86-48624c60426f","shared_citers":21},{"title":"Wan: Open and Advanced Large-Scale Video Generative Models","work_id":"ad3ebc3b-4224-46c9-b61d-bcf135da0a7c","shared_citers":21},{"title":"Adam: A Method for Stochastic Optimization","work_id":"1910796d-9b52-4683-bf5c-de9632c1028b","shared_citers":20},{"title":"Flow Matching for Generative Modeling","work_id":"6edb71c4-5d64-40af-a394-9757ea051a36","shared_citers":18},{"title":"OpenVLA: An Open-Source Vision-Language-Action Model","work_id":"3e7e65c5-5aed-4fe9-8414-2092bcb31cc7","shared_citers":18},{"title":"Auto-Encoding Variational Bayes","work_id":"97d95295-30e1-42b4-bbf6-85f0fa4edb44","shared_citers":17},{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","shared_citers":16},{"title":"$\\pi_{0.5}$: a Vision-Language-Action Model with Open-World Generalization","work_id":"d1ad7304-d09a-49bc-809e-846439f6aff9","shared_citers":13},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":13},{"title":"SDXL: Improving Latent Diffusion Models for High-Resolution Image Synthesis","work_id":"8034c587-fba6-4941-87ba-c98f2ac962cb","shared_citers":13},{"title":"Stable Video Diffusion: Scaling Latent Video Diffusion Models to Large Datasets","work_id":"4f68eada-27e3-437a-a2fe-6e4ca524d0d3","shared_citers":13},{"title":"CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer","work_id":"f38fc088-12aa-4bf4-9ecd-08d3e797ccb7","shared_citers":12},{"title":"GR00T N1: An Open Foundation Model for Generalist Humanoid Robots","work_id":"e2db69c7-ee8a-4cb7-a761-7b8de1dfcf97","shared_citers":12},{"title":"Representation Alignment for Generation: Training Diffusion Transformers Is Easier Than You Think","work_id":"1aff8ef8-079b-4afe-9e6a-148e6fd08e6a","shared_citers":12},{"title":"RT-1: Robotics Transformer for Real-World Control at Scale","work_id":"e11bda85-8531-46bc-a07f-d0ade3643ab1","shared_citers":12},{"title":"RT-2: Vision-Language-Action Models Transfer Web Knowledge to Robotic Control","work_id":"ff438a8a-8003-4fae-9131-acd418b3597b","shared_citers":12},{"title":"SAM 3: Segment Anything with Concepts","work_id":"4a72a006-2592-4554-aad0-a9c41a9f952d","shared_citers":12}],"time_series":[{"n":1,"year":2023},{"n":3,"year":2024},{"n":9,"year":2025},{"n":267,"year":2026}]},"error":null,"updated_at":"2026-05-13T19:03:26.502849+00:00"},"identity_refresh":{"job_type":"identity_refresh","status":"succeeded","result":{"fixed":1,"items":[{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","resolver":"local_arxiv","confidence":0.98,"old_work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e"}],"errors":[],"attempted":1},"error":null,"updated_at":"2026-05-13T18:53:31.576325+00:00"},"role_polarity":{"job_type":"role_polarity","status":"succeeded","result":{"title":"DINOv2: Learning Robust Visual Features without Supervision","claims":[{"claim_text":"The recent breakthroughs in natural language processing for model pretraining on large quantities of data have opened the way for similar foundation models in computer vision. These models could greatly simplify the use of images in any system by producing all-purpose visual features, i.e., features that work across image distributions and tasks without finetuning. This work shows that existing pretraining methods, especially self-supervised methods, can produce such features if trained on enough curated data from diverse sources. We revisit existing approaches and combine different techniques","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks DINOv2: Learning Robust Visual Features without Supervision because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-13T19:03:26.279046+00:00"},"summary_claims":{"job_type":"summary_claims","status":"succeeded","result":{"title":"DINOv2: Learning Robust Visual Features without Supervision","claims":[{"claim_text":"The recent breakthroughs in natural language processing for model pretraining on large quantities of data have opened the way for similar foundation models in computer vision. These models could greatly simplify the use of images in any system by producing all-purpose visual features, i.e., features that work across image distributions and tasks without finetuning. This work shows that existing pretraining methods, especially self-supervised methods, can produce such features if trained on enough curated data from diverse sources. We revisit existing approaches and combine different techniques","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks DINOv2: Learning Robust Visual Features without Supervision because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-13T19:03:26.277174+00:00"}},"summary":{"title":"DINOv2: Learning Robust Visual Features without Supervision","claims":[{"claim_text":"The recent breakthroughs in natural language processing for model pretraining on large quantities of data have opened the way for similar foundation models in computer vision. These models could greatly simplify the use of images in any system by producing all-purpose visual features, i.e., features that work across image distributions and tasks without finetuning. This work shows that existing pretraining methods, especially self-supervised methods, can produce such features if trained on enough curated data from diverse sources. We revisit existing approaches and combine different techniques","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks DINOv2: Learning Robust Visual Features without Supervision because it crossed a citation-hub threshold.","role_counts":[]},"graph":{"co_cited":[{"title":"DINOv3","work_id":"c8b07deb-8fe7-4e18-9620-f3569d3529ce","shared_citers":56},{"title":"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale","work_id":"e96730e3-129b-4db6-b981-15ab7932e297","shared_citers":53},{"title":"Decoupled Weight Decay Regularization","work_id":"07ef7360-d385-4033-83f7-8384a6325204","shared_citers":33},{"title":"SAM 2: Segment Anything in Images and Videos","work_id":"acc13f66-d814-44f9-9688-375688bf2d4a","shared_citers":32},{"title":"SigLIP 2: Multilingual Vision-Language Encoders with Improved Semantic Understanding, Localization, and Dense Features","work_id":"50eec732-2d41-432f-9dcf-ac7fff235ea5","shared_citers":31},{"title":"$\\pi_0$: A Vision-Language-Action Flow Model for General Robot Control","work_id":"f790abdc-a796-482f-a40d-f8ee035ecfc2","shared_citers":22},{"title":"Qwen3-VL Technical Report","work_id":"1fe243aa-e3c0-4da6-b391-4cbcfc88d5c0","shared_citers":22},{"title":"Qwen2.5-VL Technical Report","work_id":"69dffacb-bfe8-442d-be86-48624c60426f","shared_citers":21},{"title":"Wan: Open and Advanced Large-Scale Video Generative Models","work_id":"ad3ebc3b-4224-46c9-b61d-bcf135da0a7c","shared_citers":21},{"title":"Adam: A Method for Stochastic Optimization","work_id":"1910796d-9b52-4683-bf5c-de9632c1028b","shared_citers":20},{"title":"Flow Matching for Generative Modeling","work_id":"6edb71c4-5d64-40af-a394-9757ea051a36","shared_citers":18},{"title":"OpenVLA: An Open-Source Vision-Language-Action Model","work_id":"3e7e65c5-5aed-4fe9-8414-2092bcb31cc7","shared_citers":18},{"title":"Auto-Encoding Variational Bayes","work_id":"97d95295-30e1-42b4-bbf6-85f0fa4edb44","shared_citers":17},{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","shared_citers":16},{"title":"$\\pi_{0.5}$: a Vision-Language-Action Model with Open-World Generalization","work_id":"d1ad7304-d09a-49bc-809e-846439f6aff9","shared_citers":13},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":13},{"title":"SDXL: Improving Latent Diffusion Models for High-Resolution Image Synthesis","work_id":"8034c587-fba6-4941-87ba-c98f2ac962cb","shared_citers":13},{"title":"Stable Video Diffusion: Scaling Latent Video Diffusion Models to Large Datasets","work_id":"4f68eada-27e3-437a-a2fe-6e4ca524d0d3","shared_citers":13},{"title":"CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer","work_id":"f38fc088-12aa-4bf4-9ecd-08d3e797ccb7","shared_citers":12},{"title":"GR00T N1: An Open Foundation Model for Generalist Humanoid Robots","work_id":"e2db69c7-ee8a-4cb7-a761-7b8de1dfcf97","shared_citers":12},{"title":"Representation Alignment for Generation: Training Diffusion Transformers Is Easier Than You Think","work_id":"1aff8ef8-079b-4afe-9e6a-148e6fd08e6a","shared_citers":12},{"title":"RT-1: Robotics Transformer for Real-World Control at Scale","work_id":"e11bda85-8531-46bc-a07f-d0ade3643ab1","shared_citers":12},{"title":"RT-2: Vision-Language-Action Models Transfer Web Knowledge to Robotic Control","work_id":"ff438a8a-8003-4fae-9131-acd418b3597b","shared_citers":12},{"title":"SAM 3: Segment Anything with Concepts","work_id":"4a72a006-2592-4554-aad0-a9c41a9f952d","shared_citers":12}],"time_series":[{"n":1,"year":2023},{"n":3,"year":2024},{"n":9,"year":2025},{"n":267,"year":2026}]},"authors":[{"id":"3a748553-6877-4d14-a1b7-8779f8052b78","orcid":null,"display_name":"Huy Vo","source":"manual","import_confidence":0.72},{"id":"efeba865-645b-4761-9405-e6f4bae79432","orcid":null,"display_name":"Marc Szafraniec","source":"manual","import_confidence":0.72},{"id":"6f0b0fee-fc8c-42aa-b744-c5d37829a086","orcid":null,"display_name":"Maxime Oquab","source":"manual","import_confidence":0.72},{"id":"2a2200a2-9d8d-4807-b668-79690b3172a1","orcid":null,"display_name":"Th\\'eo Moutakanni","source":"manual","import_confidence":0.72},{"id":"b3c2fff2-1fd0-45c4-a265-ba833fbc9115","orcid":null,"display_name":"Timoth\\'ee Darcet","source":"manual","import_confidence":0.72},{"id":"0072825c-7418-4dbe-b845-de490ec2080e","orcid":null,"display_name":"Vasil Khalidov","source":"manual","import_confidence":0.72}]}}