{"work":{"id":"ad3e05b3-af3a-4fa2-ab30-c45f9f403277","openalex_id":null,"doi":null,"arxiv_id":null,"raw_key":"raw:188ac2e478afb485507d5ac5","title":"Learning transferable visual models from natural language supervision","authors":null,"authors_text":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al","year":2021,"venue":null,"abstract":null,"external_url":null,"cited_by_count":null,"metadata_source":"raw_reference","metadata_fetched_at":"2026-05-26T21:03:02.087526+00:00","pith_arxiv_id":null,"created_at":"2026-05-12T11:41:33.366286+00:00","updated_at":"2026-06-05T21:23:00.469572+00:00","title_quality_ok":true,"display_title":"Learning transferable visual models from natural language supervision","render_title":"Learning transferable visual models from natural language supervision"},"hub":{"state":{"work_id":"ad3e05b3-af3a-4fa2-ab30-c45f9f403277","tier":"hub","tier_reason":"10+ Pith inbound or 1,000+ external citations","pith_inbound_count":94,"external_cited_by_count":null,"distinct_field_count":8,"first_pith_cited_at":"2025-05-25T18:33:05+00:00","last_pith_cited_at":"2026-05-22T01:21:45+00:00","author_build_status":"not_needed","summary_status":"needed","contexts_status":"needed","graph_status":"needed","ask_index_status":"not_needed","reader_status":"not_needed","recognition_status":"not_needed","updated_at":"2026-06-29T15:19:00.350176+00:00","tier_text":"hub"},"tier":"hub","role_counts":[{"context_role":"background","n":20},{"context_role":"method","n":10},{"context_role":"baseline","n":1},{"context_role":"dataset","n":1},{"context_role":"other","n":1}],"polarity_counts":[{"context_polarity":"background","n":20},{"context_polarity":"use_method","n":10},{"context_polarity":"baseline","n":1},{"context_polarity":"unclear","n":1},{"context_polarity":"use_dataset","n":1}],"runs":{"context_extract":{"job_type":"context_extract","status":"succeeded","result":{"enqueued_papers":25},"error":null,"updated_at":"2026-05-14T18:46:24.731235+00:00"},"graph_features":{"job_type":"graph_features","status":"succeeded","result":{"co_cited":[{"title":"Qwen3-VL Technical Report","work_id":"1fe243aa-e3c0-4da6-b391-4cbcfc88d5c0","shared_citers":8},{"title":"Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","work_id":"4819f738-f69f-49dd-8bed-404f647de63a","shared_citers":5},{"title":"Momentum contrast for unsupervised visual representation learning","work_id":"b728478f-a2c8-4b36-8c46-30ffbd4e1cd9","shared_citers":5},{"title":"Sigmoid loss for language image pre-training","work_id":"7d49d7f8-2cb8-4e91-8b78-0753271fb6a2","shared_citers":5},{"title":"Visual instruction tuning.Advances in neural information processing systems, 36:34892–34916","work_id":"115823a2-8918-4227-8872-3d0a36ff07a9","shared_citers":5},{"title":"A simple framework for contrastive learning of visual representations","work_id":"ef2113fc-d254-4c64-be2f-803754588eeb","shared_citers":4},{"title":"InternVL3.5: Advancing Open-Source Multimodal Models in Versatility, Reasoning, and Efficiency","work_id":"b8f5e260-fff5-444e-bcf5-2c42cfefd83d","shared_citers":4},{"title":"Mmmu: A massive multi-discipline multimodal understanding and reasoning benchmark for expert agi","work_id":"379af22d-41e1-4dcd-90ed-ba12a2ce31f5","shared_citers":4},{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","shared_citers":4},{"title":"Adam: A Method for Stochastic Optimization","work_id":"1910796d-9b52-4683-bf5c-de9632c1028b","shared_citers":3},{"title":"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale","work_id":"e96730e3-129b-4db6-b981-15ab7932e297","shared_citers":3},{"title":"Are we on the right way for evaluating large vision-language models?Advances in Neural Information Processing Systems, 37:27056–27087","work_id":"cbee3b78-7e4d-43ad-9700-b93fccc00202","shared_citers":3},{"title":"Attention is all you need.Advances in neural information processing systems, 30","work_id":"751efe07-5e91-415c-b3d1-f4734aa26960","shared_citers":3},{"title":"Deep residual learning for image recognition","work_id":"9a00f23b-71b4-4f99-a115-0d30a296f178","shared_citers":3},{"title":"DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning","work_id":"e6b75ad5-2877-4168-97c8-710407094d20","shared_citers":3},{"title":"Denoising diffusion probabilistic models.Advances in neural information processing systems, 33:6840–6851","work_id":"82ba805b-3e59-43c6-b37f-3aa1940eea68","shared_citers":3},{"title":"DINOv2: Learning Robust Visual Features without Supervision","work_id":"26b304e5-b54a-4f26-be7e-83299eca52e4","shared_citers":3},{"title":"Gemini 2.5: Pushing the Frontier with Advanced Reasoning, Multimodality, Long Context, and Next Generation Agentic Capabilities","work_id":"008df105-2fdd-45d8-857a-8e35868aecb6","shared_citers":3},{"title":"Gemini: A Family of Highly Capable Multimodal Models","work_id":"83f7c85b-3f11-450f-ac0c-64d9745220b2","shared_citers":3},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":3},{"title":"Improved baselines with visual instruction tuning","work_id":"2431039a-0ca7-4c86-9e3b-0410e370605f","shared_citers":3},{"title":"Internvl: Scaling up vision foundation models and aligning for generic visual-linguistic tasks","work_id":"321b2bd4-950a-44f0-ab50-e70251e75187","shared_citers":3},{"title":"Learning multiple layers of features from tiny images","work_id":"3807822d-12bd-4f6f-89ab-c3132c0cbfff","shared_citers":3},{"title":"Llama 2: Open Foundation and Fine-Tuned Chat Models","work_id":"68a5177f-d644-44c1-bd4f-4e5278c22f5d","shared_citers":3}],"time_series":[{"n":34,"year":2026}],"dependency_candidates":[]},"error":null,"updated_at":"2026-05-14T18:46:42.403923+00:00"},"identity_refresh":{"job_type":"identity_refresh","status":"succeeded","result":{"items":[{"title":"Qwen3 Technical Report","outcome":"unchanged","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","resolver":"local_arxiv","confidence":0.98,"old_work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e"}],"counts":{"fixed":0,"merged":0,"unchanged":1,"quarantined":0,"needs_external_resolution":0},"errors":[],"attempted":1},"error":null,"updated_at":"2026-05-14T18:46:29.287736+00:00"},"summary_claims":{"job_type":"summary_claims","status":"succeeded","result":{"title":"Learning transferable visual models from natural language supervision","claims":[],"why_cited":"Pith tracks Learning transferable visual models from natural language supervision because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-14T18:46:24.817347+00:00"}},"summary":{"title":"Learning transferable visual models from natural language supervision","claims":[],"why_cited":"Pith tracks Learning transferable visual models from natural language supervision because it crossed a citation-hub threshold.","role_counts":[]},"graph":{"co_cited":[{"title":"Qwen3-VL Technical Report","work_id":"1fe243aa-e3c0-4da6-b391-4cbcfc88d5c0","shared_citers":8},{"title":"Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","work_id":"4819f738-f69f-49dd-8bed-404f647de63a","shared_citers":5},{"title":"Momentum contrast for unsupervised visual representation learning","work_id":"b728478f-a2c8-4b36-8c46-30ffbd4e1cd9","shared_citers":5},{"title":"Sigmoid loss for language image pre-training","work_id":"7d49d7f8-2cb8-4e91-8b78-0753271fb6a2","shared_citers":5},{"title":"Visual instruction tuning.Advances in neural information processing systems, 36:34892–34916","work_id":"115823a2-8918-4227-8872-3d0a36ff07a9","shared_citers":5},{"title":"A simple framework for contrastive learning of visual representations","work_id":"ef2113fc-d254-4c64-be2f-803754588eeb","shared_citers":4},{"title":"InternVL3.5: Advancing Open-Source Multimodal Models in Versatility, Reasoning, and Efficiency","work_id":"b8f5e260-fff5-444e-bcf5-2c42cfefd83d","shared_citers":4},{"title":"Mmmu: A massive multi-discipline multimodal understanding and reasoning benchmark for expert agi","work_id":"379af22d-41e1-4dcd-90ed-ba12a2ce31f5","shared_citers":4},{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","shared_citers":4},{"title":"Adam: A Method for Stochastic Optimization","work_id":"1910796d-9b52-4683-bf5c-de9632c1028b","shared_citers":3},{"title":"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale","work_id":"e96730e3-129b-4db6-b981-15ab7932e297","shared_citers":3},{"title":"Are we on the right way for evaluating large vision-language models?Advances in Neural Information Processing Systems, 37:27056–27087","work_id":"cbee3b78-7e4d-43ad-9700-b93fccc00202","shared_citers":3},{"title":"Attention is all you need.Advances in neural information processing systems, 30","work_id":"751efe07-5e91-415c-b3d1-f4734aa26960","shared_citers":3},{"title":"Deep residual learning for image recognition","work_id":"9a00f23b-71b4-4f99-a115-0d30a296f178","shared_citers":3},{"title":"DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning","work_id":"e6b75ad5-2877-4168-97c8-710407094d20","shared_citers":3},{"title":"Denoising diffusion probabilistic models.Advances in neural information processing systems, 33:6840–6851","work_id":"82ba805b-3e59-43c6-b37f-3aa1940eea68","shared_citers":3},{"title":"DINOv2: Learning Robust Visual Features without Supervision","work_id":"26b304e5-b54a-4f26-be7e-83299eca52e4","shared_citers":3},{"title":"Gemini 2.5: Pushing the Frontier with Advanced Reasoning, Multimodality, Long Context, and Next Generation Agentic Capabilities","work_id":"008df105-2fdd-45d8-857a-8e35868aecb6","shared_citers":3},{"title":"Gemini: A Family of Highly Capable Multimodal Models","work_id":"83f7c85b-3f11-450f-ac0c-64d9745220b2","shared_citers":3},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":3},{"title":"Improved baselines with visual instruction tuning","work_id":"2431039a-0ca7-4c86-9e3b-0410e370605f","shared_citers":3},{"title":"Internvl: Scaling up vision foundation models and aligning for generic visual-linguistic tasks","work_id":"321b2bd4-950a-44f0-ab50-e70251e75187","shared_citers":3},{"title":"Learning multiple layers of features from tiny images","work_id":"3807822d-12bd-4f6f-89ab-c3132c0cbfff","shared_citers":3},{"title":"Llama 2: Open Foundation and Fine-Tuned Chat Models","work_id":"68a5177f-d644-44c1-bd4f-4e5278c22f5d","shared_citers":3}],"time_series":[{"n":34,"year":2026}],"dependency_candidates":[]},"authors":[]}}