{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2021:XUD5VQBEN3C3DCTDBP33PMPOTT","short_pith_number":"pith:XUD5VQBE","schema_version":"1.0","canonical_sha256":"bd07dac0246ec5b18a630bf7b7b1ee9cc56c013b7525f6fd7c3add0f3ee26684","source":{"kind":"arxiv","id":"2104.13921","version":3},"attestation_state":"computed","paper":{"title":"Open-vocabulary Object Detection via Vision and Language Knowledge Distillation","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"A vision-language distillation method trains object detectors to recognize arbitrary text-described objects, including categories never seen in training.","cross_cats":["cs.AI","cs.LG"],"primary_cat":"cs.CV","authors_text":"Tsung-Yi Lin, Weicheng Kuo, Xiuye Gu, Yin Cui","submitted_at":"2021-04-28T17:58:57Z","abstract_excerpt":"We aim at advancing open-vocabulary object detection, which detects objects described by arbitrary text inputs. The fundamental challenge is the availability of training data. It is costly to further scale up the number of classes contained in existing object detection datasets. To overcome this challenge, we propose ViLD, a training method via Vision and Language knowledge Distillation. Our method distills the knowledge from a pretrained open-vocabulary image classification model (teacher) into a two-stage detector (student). Specifically, we use the teacher model to encode category texts and"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":true,"formal_links_present":true},"canonical_record":{"source":{"id":"2104.13921","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2021-04-28T17:58:57Z","cross_cats_sorted":["cs.AI","cs.LG"],"title_canon_sha256":"14e6e262ae9f9fd75c3b326bb712106491e0ea98bba9e020cfd7a299b938345f","abstract_canon_sha256":"98315c67feae8fab3e6d382b299a6260eb382798eaff40304ac8565486c2c684"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:13.952546Z","signature_b64":"Rz4Jzu/r3nU9rT3cAEcyCq7CbuAaIlCxANgh9RouAlMgH1HTEyqCD5Mkj8zegZNvu1o6sbwJRlGvHhIaa34jBg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"bd07dac0246ec5b18a630bf7b7b1ee9cc56c013b7525f6fd7c3add0f3ee26684","last_reissued_at":"2026-05-17T23:38:13.952040Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:13.952040Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Open-vocabulary Object Detection via Vision and Language Knowledge Distillation","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"A vision-language distillation method trains object detectors to recognize arbitrary text-described objects, including categories never seen in training.","cross_cats":["cs.AI","cs.LG"],"primary_cat":"cs.CV","authors_text":"Tsung-Yi Lin, Weicheng Kuo, Xiuye Gu, Yin Cui","submitted_at":"2021-04-28T17:58:57Z","abstract_excerpt":"We aim at advancing open-vocabulary object detection, which detects objects described by arbitrary text inputs. The fundamental challenge is the availability of training data. It is costly to further scale up the number of classes contained in existing object detection datasets. To overcome this challenge, we propose ViLD, a training method via Vision and Language knowledge Distillation. Our method distills the knowledge from a pretrained open-vocabulary image classification model (teacher) into a two-stage detector (student). Specifically, we use the teacher model to encode category texts and"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"ViLD obtains 16.1 mask AP_r with a ResNet-50 backbone, even outperforming the supervised counterpart by 3.8. When trained with a stronger teacher model ALIGN, ViLD achieves 26.3 AP_r.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That embeddings produced by the teacher on image regions and category texts remain sufficiently aligned with the student's region proposals even for categories never seen during detector training.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"ViLD distills region and text embeddings from a teacher vision-language model into a student detector, enabling open-vocabulary detection that outperforms supervised baselines on held-out rare classes in LVIS and transfers to COCO, VOC, and Objects365.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"A vision-language distillation method trains object detectors to recognize arbitrary text-described objects, including categories never seen in training.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"7d8e23e4be185107e5353e68d5c989d4de6580d8d198a849933cd726df1d6ccd"},"source":{"id":"2104.13921","kind":"arxiv","version":3},"verdict":{"id":"cc4f62de-02b7-4842-9bca-42323a2283d0","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-17T13:26:40.448557Z","strongest_claim":"ViLD obtains 16.1 mask AP_r with a ResNet-50 backbone, even outperforming the supervised counterpart by 3.8. When trained with a stronger teacher model ALIGN, ViLD achieves 26.3 AP_r.","one_line_summary":"ViLD distills region and text embeddings from a teacher vision-language model into a student detector, enabling open-vocabulary detection that outperforms supervised baselines on held-out rare classes in LVIS and transfers to COCO, VOC, and Objects365.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That embeddings produced by the teacher on image regions and category texts remain sufficiently aligned with the student's region proposals even for categories never seen during detector training.","pith_extraction_headline":"A vision-language distillation method trains object detectors to recognize arbitrary text-described objects, including categories never seen in training."},"references":{"count":15,"sample":[{"doi":"","year":2022,"title":"Lvis: A dataset for large vocabulary instance seg- mentation","work_id":"5ddcda50-e6ce-4b2c-a24e-bbfdf07b61fe","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2014,"title":"Zero shot recognition with unreliable attributes","work_id":"aaa5b1db-be6b-4f3a-aec0-c4d6a3a04645","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2022,"title":"Faster r-cnn: Towards real-time object detection with region proposal networks","work_id":"1eef4e5b-94fd-46b5-9738-0bcb370c501d","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2020,"title":"Technical report: A good box is not a guarantee of a good mask.Joint COCO and LVIS workshop at ECCV 2020: LVIS Challenge Track,","work_id":"e67650a8-7071-4aa5-a9d6-93ef00b68721","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2011,"title":"The caltech-ucsd birds-200-2011 dataset","work_id":"17ded59a-8f55-47d9-be8f-3bb8450680df","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":15,"snapshot_sha256":"b319620f76aed8040b982eb85f2b43350806f7ca7ba2fca4191d6903792a016a","internal_anchors":0},"formal_canon":{"evidence_count":2,"snapshot_sha256":"b7ae686b068be0f4fbb7e60c7111dfdc08a83a7f289e9063a66bc513a12c19c8"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2104.13921","created_at":"2026-05-17T23:38:13.952123+00:00"},{"alias_kind":"arxiv_version","alias_value":"2104.13921v3","created_at":"2026-05-17T23:38:13.952123+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2104.13921","created_at":"2026-05-17T23:38:13.952123+00:00"},{"alias_kind":"pith_short_12","alias_value":"XUD5VQBEN3C3","created_at":"2026-05-18T12:33:33.725879+00:00"},{"alias_kind":"pith_short_16","alias_value":"XUD5VQBEN3C3DCTD","created_at":"2026-05-18T12:33:33.725879+00:00"},{"alias_kind":"pith_short_8","alias_value":"XUD5VQBE","created_at":"2026-05-18T12:33:33.725879+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":19,"internal_anchor_count":19,"sample":[{"citing_arxiv_id":"2510.08278","citing_title":"A Multimodal Depth-Aware Method For Embodied Reference Understanding","ref_index":18,"is_internal_anchor":true},{"citing_arxiv_id":"2511.16719","citing_title":"SAM 3: Segment Anything with Concepts","ref_index":39,"is_internal_anchor":true},{"citing_arxiv_id":"2511.21064","citing_title":"OVOD-Agent: A Markov-Bandit Framework for Proactive Visual Reasoning and Self-Evolving Detection","ref_index":12,"is_internal_anchor":true},{"citing_arxiv_id":"2201.03546","citing_title":"Language-driven Semantic Segmentation","ref_index":4,"is_internal_anchor":true},{"citing_arxiv_id":"2204.00598","citing_title":"Socratic Models: Composing Zero-Shot Multimodal Reasoning with Language","ref_index":65,"is_internal_anchor":true},{"citing_arxiv_id":"2209.07753","citing_title":"Code as Policies: Language Model Programs for Embodied Control","ref_index":3,"is_internal_anchor":true},{"citing_arxiv_id":"2604.02753","citing_title":"DeCo-DETR: Decoupled Cognition DETR for efficient Open-Vocabulary Object Detection","ref_index":5,"is_internal_anchor":true},{"citing_arxiv_id":"2604.00503","citing_title":"PET-DINO: Unifying Visual Cues into Grounding DINO with Prompt-Enriched Training","ref_index":7,"is_internal_anchor":true},{"citing_arxiv_id":"2604.02753","citing_title":"DeCo-DETR: Decoupled Cognition DETR for efficient Open-Vocabulary Object Detection","ref_index":5,"is_internal_anchor":true},{"citing_arxiv_id":"2307.05973","citing_title":"VoxPoser: Composable 3D Value Maps for Robotic Manipulation with Language Models","ref_index":13,"is_internal_anchor":true},{"citing_arxiv_id":"2605.11771","citing_title":"Revisiting Shadow Detection from a Vision-Language Perspective","ref_index":50,"is_internal_anchor":true},{"citing_arxiv_id":"2310.11441","citing_title":"Set-of-Mark Prompting Unleashes Extraordinary Visual Grounding in GPT-4V","ref_index":17,"is_internal_anchor":true},{"citing_arxiv_id":"2605.03642","citing_title":"The Detector Teaches Itself: Lightweight Self-Supervised Adaptation for Open-Vocabulary Object Detection","ref_index":13,"is_internal_anchor":true},{"citing_arxiv_id":"2604.26250","citing_title":"Beyond Shortcuts: Mitigating Visual Illusions in Frozen VLMs via Qualitative Reasoning","ref_index":5,"is_internal_anchor":true},{"citing_arxiv_id":"2604.24575","citing_title":"Diffusion Model as a Generalist Segmentation Learner","ref_index":31,"is_internal_anchor":true},{"citing_arxiv_id":"2207.05608","citing_title":"Inner Monologue: Embodied Reasoning through Planning with Language Models","ref_index":77,"is_internal_anchor":true},{"citing_arxiv_id":"2204.01691","citing_title":"Do As I Can, Not As I Say: Grounding Language in Robotic Affordances","ref_index":27,"is_internal_anchor":true},{"citing_arxiv_id":"2604.13722","citing_title":"Granularity-Aware Transfer for Tree Instance Segmentation in Synthetic and Real Forests","ref_index":7,"is_internal_anchor":true},{"citing_arxiv_id":"2604.14684","citing_title":"DETR-ViP: Detection Transformer with Robust Discriminative Visual Prompts","ref_index":4,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":2,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/XUD5VQBEN3C3DCTDBP33PMPOTT","json":"https://pith.science/pith/XUD5VQBEN3C3DCTDBP33PMPOTT.json","graph_json":"https://pith.science/api/pith-number/XUD5VQBEN3C3DCTDBP33PMPOTT/graph.json","events_json":"https://pith.science/api/pith-number/XUD5VQBEN3C3DCTDBP33PMPOTT/events.json","paper":"https://pith.science/paper/XUD5VQBE"},"agent_actions":{"view_html":"https://pith.science/pith/XUD5VQBEN3C3DCTDBP33PMPOTT","download_json":"https://pith.science/pith/XUD5VQBEN3C3DCTDBP33PMPOTT.json","view_paper":"https://pith.science/paper/XUD5VQBE","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2104.13921&json=true","fetch_graph":"https://pith.science/api/pith-number/XUD5VQBEN3C3DCTDBP33PMPOTT/graph.json","fetch_events":"https://pith.science/api/pith-number/XUD5VQBEN3C3DCTDBP33PMPOTT/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/XUD5VQBEN3C3DCTDBP33PMPOTT/action/timestamp_anchor","attest_storage":"https://pith.science/pith/XUD5VQBEN3C3DCTDBP33PMPOTT/action/storage_attestation","attest_author":"https://pith.science/pith/XUD5VQBEN3C3DCTDBP33PMPOTT/action/author_attestation","sign_citation":"https://pith.science/pith/XUD5VQBEN3C3DCTDBP33PMPOTT/action/citation_signature","submit_replication":"https://pith.science/pith/XUD5VQBEN3C3DCTDBP33PMPOTT/action/replication_record"}},"created_at":"2026-05-17T23:38:13.952123+00:00","updated_at":"2026-05-17T23:38:13.952123+00:00"}