{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2021:XUD5VQBEN3C3DCTDBP33PMPOTT","short_pith_number":"pith:XUD5VQBE","canonical_record":{"source":{"id":"2104.13921","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2021-04-28T17:58:57Z","cross_cats_sorted":["cs.AI","cs.LG"],"title_canon_sha256":"14e6e262ae9f9fd75c3b326bb712106491e0ea98bba9e020cfd7a299b938345f","abstract_canon_sha256":"98315c67feae8fab3e6d382b299a6260eb382798eaff40304ac8565486c2c684"},"schema_version":"1.0"},"canonical_sha256":"bd07dac0246ec5b18a630bf7b7b1ee9cc56c013b7525f6fd7c3add0f3ee26684","source":{"kind":"arxiv","id":"2104.13921","version":3},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2104.13921","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"arxiv_version","alias_value":"2104.13921v3","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2104.13921","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"pith_short_12","alias_value":"XUD5VQBEN3C3","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_16","alias_value":"XUD5VQBEN3C3DCTD","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_8","alias_value":"XUD5VQBE","created_at":"2026-05-18T12:33:33Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2021:XUD5VQBEN3C3DCTDBP33PMPOTT","target":"record","payload":{"canonical_record":{"source":{"id":"2104.13921","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2021-04-28T17:58:57Z","cross_cats_sorted":["cs.AI","cs.LG"],"title_canon_sha256":"14e6e262ae9f9fd75c3b326bb712106491e0ea98bba9e020cfd7a299b938345f","abstract_canon_sha256":"98315c67feae8fab3e6d382b299a6260eb382798eaff40304ac8565486c2c684"},"schema_version":"1.0"},"canonical_sha256":"bd07dac0246ec5b18a630bf7b7b1ee9cc56c013b7525f6fd7c3add0f3ee26684","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:13.952546Z","signature_b64":"Rz4Jzu/r3nU9rT3cAEcyCq7CbuAaIlCxANgh9RouAlMgH1HTEyqCD5Mkj8zegZNvu1o6sbwJRlGvHhIaa34jBg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"bd07dac0246ec5b18a630bf7b7b1ee9cc56c013b7525f6fd7c3add0f3ee26684","last_reissued_at":"2026-05-17T23:38:13.952040Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:13.952040Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2104.13921","source_version":3,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:13Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"cn5EJASeeJhQqVZjBZ8G4xeNAZM4A7M8+xpptfHVBLI0iRhYpo5i2cabrjYa5g/EyzoXPtGf4Oa/SurdC7EiAA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-18T21:34:33.940478Z"},"content_sha256":"38e1f1e91f4c2890e067386af4d89f838b0c6041708822f9db5ac01b81fc6d02","schema_version":"1.0","event_id":"sha256:38e1f1e91f4c2890e067386af4d89f838b0c6041708822f9db5ac01b81fc6d02"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2021:XUD5VQBEN3C3DCTDBP33PMPOTT","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Open-vocabulary Object Detection via Vision and Language Knowledge Distillation","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"A vision-language distillation method trains object detectors to recognize arbitrary text-described objects, including categories never seen in training.","cross_cats":["cs.AI","cs.LG"],"primary_cat":"cs.CV","authors_text":"Tsung-Yi Lin, Weicheng Kuo, Xiuye Gu, Yin Cui","submitted_at":"2021-04-28T17:58:57Z","abstract_excerpt":"We aim at advancing open-vocabulary object detection, which detects objects described by arbitrary text inputs. The fundamental challenge is the availability of training data. It is costly to further scale up the number of classes contained in existing object detection datasets. To overcome this challenge, we propose ViLD, a training method via Vision and Language knowledge Distillation. Our method distills the knowledge from a pretrained open-vocabulary image classification model (teacher) into a two-stage detector (student). Specifically, we use the teacher model to encode category texts and"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"ViLD obtains 16.1 mask AP_r with a ResNet-50 backbone, even outperforming the supervised counterpart by 3.8. When trained with a stronger teacher model ALIGN, ViLD achieves 26.3 AP_r.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That embeddings produced by the teacher on image regions and category texts remain sufficiently aligned with the student's region proposals even for categories never seen during detector training.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"ViLD distills region and text embeddings from a teacher vision-language model into a student detector, enabling open-vocabulary detection that outperforms supervised baselines on held-out rare classes in LVIS and transfers to COCO, VOC, and Objects365.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"A vision-language distillation method trains object detectors to recognize arbitrary text-described objects, including categories never seen in training.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"7d8e23e4be185107e5353e68d5c989d4de6580d8d198a849933cd726df1d6ccd"},"source":{"id":"2104.13921","kind":"arxiv","version":3},"verdict":{"id":"cc4f62de-02b7-4842-9bca-42323a2283d0","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-17T13:26:40.448557Z","strongest_claim":"ViLD obtains 16.1 mask AP_r with a ResNet-50 backbone, even outperforming the supervised counterpart by 3.8. When trained with a stronger teacher model ALIGN, ViLD achieves 26.3 AP_r.","one_line_summary":"ViLD distills region and text embeddings from a teacher vision-language model into a student detector, enabling open-vocabulary detection that outperforms supervised baselines on held-out rare classes in LVIS and transfers to COCO, VOC, and Objects365.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That embeddings produced by the teacher on image regions and category texts remain sufficiently aligned with the student's region proposals even for categories never seen during detector training.","pith_extraction_headline":"A vision-language distillation method trains object detectors to recognize arbitrary text-described objects, including categories never seen in training."},"references":{"count":15,"sample":[{"doi":"","year":2022,"title":"Lvis: A dataset for large vocabulary instance seg- mentation","work_id":"5ddcda50-e6ce-4b2c-a24e-bbfdf07b61fe","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2014,"title":"Zero shot recognition with unreliable attributes","work_id":"aaa5b1db-be6b-4f3a-aec0-c4d6a3a04645","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2022,"title":"Faster r-cnn: Towards real-time object detection with region proposal networks","work_id":"1eef4e5b-94fd-46b5-9738-0bcb370c501d","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2020,"title":"Technical report: A good box is not a guarantee of a good mask.Joint COCO and LVIS workshop at ECCV 2020: LVIS Challenge Track,","work_id":"e67650a8-7071-4aa5-a9d6-93ef00b68721","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2011,"title":"The caltech-ucsd birds-200-2011 dataset","work_id":"17ded59a-8f55-47d9-be8f-3bb8450680df","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":15,"snapshot_sha256":"b319620f76aed8040b982eb85f2b43350806f7ca7ba2fca4191d6903792a016a","internal_anchors":0},"formal_canon":{"evidence_count":2,"snapshot_sha256":"b7ae686b068be0f4fbb7e60c7111dfdc08a83a7f289e9063a66bc513a12c19c8"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"cc4f62de-02b7-4842-9bca-42323a2283d0"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:13Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"U9CjkasODLCvNPO2lwYjFWk2fQDelKl0kNdT0qK5VNkDbb6LjU5lTNJpkib26Em53iNyqus1WL3JKPE2GtYwCw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-18T21:34:33.941312Z"},"content_sha256":"c6ea6129f43429c3eac4c8d24dd6b7139b321c6bba3c5c97e1da48a586200318","schema_version":"1.0","event_id":"sha256:c6ea6129f43429c3eac4c8d24dd6b7139b321c6bba3c5c97e1da48a586200318"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/XUD5VQBEN3C3DCTDBP33PMPOTT/bundle.json","state_url":"https://pith.science/pith/XUD5VQBEN3C3DCTDBP33PMPOTT/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/XUD5VQBEN3C3DCTDBP33PMPOTT/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-18T21:34:33Z","links":{"resolver":"https://pith.science/pith/XUD5VQBEN3C3DCTDBP33PMPOTT","bundle":"https://pith.science/pith/XUD5VQBEN3C3DCTDBP33PMPOTT/bundle.json","state":"https://pith.science/pith/XUD5VQBEN3C3DCTDBP33PMPOTT/state.json","well_known_bundle":"https://pith.science/.well-known/pith/XUD5VQBEN3C3DCTDBP33PMPOTT/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2021:XUD5VQBEN3C3DCTDBP33PMPOTT","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"98315c67feae8fab3e6d382b299a6260eb382798eaff40304ac8565486c2c684","cross_cats_sorted":["cs.AI","cs.LG"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2021-04-28T17:58:57Z","title_canon_sha256":"14e6e262ae9f9fd75c3b326bb712106491e0ea98bba9e020cfd7a299b938345f"},"schema_version":"1.0","source":{"id":"2104.13921","kind":"arxiv","version":3}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2104.13921","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"arxiv_version","alias_value":"2104.13921v3","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2104.13921","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"pith_short_12","alias_value":"XUD5VQBEN3C3","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_16","alias_value":"XUD5VQBEN3C3DCTD","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_8","alias_value":"XUD5VQBE","created_at":"2026-05-18T12:33:33Z"}],"graph_snapshots":[{"event_id":"sha256:c6ea6129f43429c3eac4c8d24dd6b7139b321c6bba3c5c97e1da48a586200318","target":"graph","created_at":"2026-05-17T23:38:13Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"ViLD obtains 16.1 mask AP_r with a ResNet-50 backbone, even outperforming the supervised counterpart by 3.8. When trained with a stronger teacher model ALIGN, ViLD achieves 26.3 AP_r."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That embeddings produced by the teacher on image regions and category texts remain sufficiently aligned with the student's region proposals even for categories never seen during detector training."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"ViLD distills region and text embeddings from a teacher vision-language model into a student detector, enabling open-vocabulary detection that outperforms supervised baselines on held-out rare classes in LVIS and transfers to COCO, VOC, and Objects365."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"A vision-language distillation method trains object detectors to recognize arbitrary text-described objects, including categories never seen in training."}],"snapshot_sha256":"7d8e23e4be185107e5353e68d5c989d4de6580d8d198a849933cd726df1d6ccd"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"b7ae686b068be0f4fbb7e60c7111dfdc08a83a7f289e9063a66bc513a12c19c8"},"paper":{"abstract_excerpt":"We aim at advancing open-vocabulary object detection, which detects objects described by arbitrary text inputs. The fundamental challenge is the availability of training data. It is costly to further scale up the number of classes contained in existing object detection datasets. To overcome this challenge, we propose ViLD, a training method via Vision and Language knowledge Distillation. Our method distills the knowledge from a pretrained open-vocabulary image classification model (teacher) into a two-stage detector (student). Specifically, we use the teacher model to encode category texts and","authors_text":"Tsung-Yi Lin, Weicheng Kuo, Xiuye Gu, Yin Cui","cross_cats":["cs.AI","cs.LG"],"headline":"A vision-language distillation method trains object detectors to recognize arbitrary text-described objects, including categories never seen in training.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2021-04-28T17:58:57Z","title":"Open-vocabulary Object Detection via Vision and Language Knowledge Distillation"},"references":{"count":15,"internal_anchors":0,"resolved_work":15,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Lvis: A dataset for large vocabulary instance seg- mentation","work_id":"5ddcda50-e6ce-4b2c-a24e-bbfdf07b61fe","year":2022},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Zero shot recognition with unreliable attributes","work_id":"aaa5b1db-be6b-4f3a-aec0-c4d6a3a04645","year":2014},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Faster r-cnn: Towards real-time object detection with region proposal networks","work_id":"1eef4e5b-94fd-46b5-9738-0bcb370c501d","year":2022},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Technical report: A good box is not a guarantee of a good mask.Joint COCO and LVIS workshop at ECCV 2020: LVIS Challenge Track,","work_id":"e67650a8-7071-4aa5-a9d6-93ef00b68721","year":2020},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"The caltech-ucsd birds-200-2011 dataset","work_id":"17ded59a-8f55-47d9-be8f-3bb8450680df","year":2011}],"snapshot_sha256":"b319620f76aed8040b982eb85f2b43350806f7ca7ba2fca4191d6903792a016a"},"source":{"id":"2104.13921","kind":"arxiv","version":3},"verdict":{"created_at":"2026-05-17T13:26:40.448557Z","id":"cc4f62de-02b7-4842-9bca-42323a2283d0","model_set":{"reader":"grok-4.3"},"one_line_summary":"ViLD distills region and text embeddings from a teacher vision-language model into a student detector, enabling open-vocabulary detection that outperforms supervised baselines on held-out rare classes in LVIS and transfers to COCO, VOC, and Objects365.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"A vision-language distillation method trains object detectors to recognize arbitrary text-described objects, including categories never seen in training.","strongest_claim":"ViLD obtains 16.1 mask AP_r with a ResNet-50 backbone, even outperforming the supervised counterpart by 3.8. When trained with a stronger teacher model ALIGN, ViLD achieves 26.3 AP_r.","weakest_assumption":"That embeddings produced by the teacher on image regions and category texts remain sufficiently aligned with the student's region proposals even for categories never seen during detector training."}},"verdict_id":"cc4f62de-02b7-4842-9bca-42323a2283d0"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:38e1f1e91f4c2890e067386af4d89f838b0c6041708822f9db5ac01b81fc6d02","target":"record","created_at":"2026-05-17T23:38:13Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"98315c67feae8fab3e6d382b299a6260eb382798eaff40304ac8565486c2c684","cross_cats_sorted":["cs.AI","cs.LG"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2021-04-28T17:58:57Z","title_canon_sha256":"14e6e262ae9f9fd75c3b326bb712106491e0ea98bba9e020cfd7a299b938345f"},"schema_version":"1.0","source":{"id":"2104.13921","kind":"arxiv","version":3}},"canonical_sha256":"bd07dac0246ec5b18a630bf7b7b1ee9cc56c013b7525f6fd7c3add0f3ee26684","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"bd07dac0246ec5b18a630bf7b7b1ee9cc56c013b7525f6fd7c3add0f3ee26684","first_computed_at":"2026-05-17T23:38:13.952040Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:13.952040Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"Rz4Jzu/r3nU9rT3cAEcyCq7CbuAaIlCxANgh9RouAlMgH1HTEyqCD5Mkj8zegZNvu1o6sbwJRlGvHhIaa34jBg==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:13.952546Z","signed_message":"canonical_sha256_bytes"},"source_id":"2104.13921","source_kind":"arxiv","source_version":3}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:38e1f1e91f4c2890e067386af4d89f838b0c6041708822f9db5ac01b81fc6d02","sha256:c6ea6129f43429c3eac4c8d24dd6b7139b321c6bba3c5c97e1da48a586200318"],"state_sha256":"fde29bece3bf3f6af5e0348eeb3868023978e59fc14912cccdde9fd1548d0c30"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"9mrNGAUVj2dCRHrB4F6BbZeAIxmwsavX4icuNVVv2FGMs4FoBriTpz4NUjzwbhcsNVkVtxOGeDhDU3nkD6CJBg==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-18T21:34:33.943714Z","bundle_sha256":"c498366cd90206be5cda9a9e3ffb97a74d4a14edaf8d241895e8f44a936d8e00"}}