{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2021:2XX22VJUL7SIEBCQF3WFMGTECS","short_pith_number":"pith:2XX22VJU","canonical_record":{"source":{"id":"2111.07832","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2021-11-15T15:18:05Z","cross_cats_sorted":[],"title_canon_sha256":"c60bc85e27b6e89d8c0f68d113d4613cc285eca8d8a9681eed9b0379dc67c9d8","abstract_canon_sha256":"6e7728ee498ae978c428082bad3072a8254578a2aac25125ad12a37dae769d5b"},"schema_version":"1.0"},"canonical_sha256":"d5efad55345fe48204502eec561a6414a6779317b3fb51db2e0c3193230934d3","source":{"kind":"arxiv","id":"2111.07832","version":3},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2111.07832","created_at":"2026-05-18T02:42:36Z"},{"alias_kind":"arxiv_version","alias_value":"2111.07832v3","created_at":"2026-05-18T02:42:36Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2111.07832","created_at":"2026-05-18T02:42:36Z"},{"alias_kind":"pith_short_12","alias_value":"2XX22VJUL7SI","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_16","alias_value":"2XX22VJUL7SIEBCQ","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_8","alias_value":"2XX22VJU","created_at":"2026-05-18T12:33:33Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2021:2XX22VJUL7SIEBCQF3WFMGTECS","target":"record","payload":{"canonical_record":{"source":{"id":"2111.07832","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2021-11-15T15:18:05Z","cross_cats_sorted":[],"title_canon_sha256":"c60bc85e27b6e89d8c0f68d113d4613cc285eca8d8a9681eed9b0379dc67c9d8","abstract_canon_sha256":"6e7728ee498ae978c428082bad3072a8254578a2aac25125ad12a37dae769d5b"},"schema_version":"1.0"},"canonical_sha256":"d5efad55345fe48204502eec561a6414a6779317b3fb51db2e0c3193230934d3","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T02:42:36.996421Z","signature_b64":"r5cA3uqJ8fFKomZyrp3Du9IVYCoZZVBt14Lhlrpgyx0gaauZmJ9Cs7OTKbwjglQTq/r+hWt8WbRdRJPov/meCA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"d5efad55345fe48204502eec561a6414a6779317b3fb51db2e0c3193230934d3","last_reissued_at":"2026-05-18T02:42:36.995746Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T02:42:36.995746Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2111.07832","source_version":3,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T02:42:36Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"nGoQXErX5Fa79OxnPu1/4XWhdDRWtwY4BtawFmTrgmIi+8mRWOYEyO9x0oGqC53Mqc7gply1W2Fg9apin9q5CA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-27T11:24:37.446742Z"},"content_sha256":"2fa69649bffe4611434f0222a640fc1f34105ad16c00a1d208034b8a7e6a1b1d","schema_version":"1.0","event_id":"sha256:2fa69649bffe4611434f0222a640fc1f34105ad16c00a1d208034b8a7e6a1b1d"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2021:2XX22VJUL7SIEBCQF3WFMGTECS","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"iBOT: Image BERT Pre-Training with Online Tokenizer","license":"http://creativecommons.org/licenses/by/4.0/","headline":"iBOT uses a jointly learned online tokenizer for masked image modeling to reach 82.3 percent linear probing accuracy on ImageNet-1K.","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Alan Yuille, Chen Wei, Cihang Xie, Huiyu Wang, Jinghao Zhou, Tao Kong, Wei Shen","submitted_at":"2021-11-15T15:18:05Z","abstract_excerpt":"The success of language Transformers is primarily attributed to the pretext task of masked language modeling (MLM), where texts are first tokenized into semantically meaningful pieces. In this work, we study masked image modeling (MIM) and indicate the advantages and challenges of using a semantically meaningful visual tokenizer. We present a self-supervised framework iBOT that can perform masked prediction with an online tokenizer. Specifically, we perform self-distillation on masked patch tokens and take the teacher network as the online tokenizer, along with self-distillation on the class t"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"We present a self-supervised framework iBOT that can perform masked prediction with an online tokenizer... achieving an 82.3% linear probing accuracy and an 87.8% fine-tuning accuracy evaluated on ImageNet-1K.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The assumption that self-distillation with an online tokenizer can produce semantically meaningful visual tokens without prior pre-training of the tokenizer.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"iBOT achieves 82.3% linear probing accuracy and 87.8% fine-tuning accuracy on ImageNet-1K using masked image modeling with a jointly trained online tokenizer.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"iBOT uses a jointly learned online tokenizer for masked image modeling to reach 82.3 percent linear probing accuracy on ImageNet-1K.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"efb23a5c7423d8692304f14fe80d19ff23ce661cbbbf38de264e5edcb7363627"},"source":{"id":"2111.07832","kind":"arxiv","version":3},"verdict":{"id":"63891510-fb3d-4921-9600-2e7adc41f084","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-14T02:05:18.808809Z","strongest_claim":"We present a self-supervised framework iBOT that can perform masked prediction with an online tokenizer... achieving an 82.3% linear probing accuracy and an 87.8% fine-tuning accuracy evaluated on ImageNet-1K.","one_line_summary":"iBOT achieves 82.3% linear probing accuracy and 87.8% fine-tuning accuracy on ImageNet-1K using masked image modeling with a jointly trained online tokenizer.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The assumption that self-distillation with an online tokenizer can produce semantically meaningful visual tokens without prior pre-training of the tokenizer.","pith_extraction_headline":"iBOT uses a jointly learned online tokenizer for masked image modeling to reach 82.3 percent linear probing accuracy on ImageNet-1K."},"references":{"count":23,"sample":[{"doi":"","year":null,"title":"Self-supervised classiﬁcation network","work_id":"f0b10977-594c-4ab5-8f0d-d096f4f37bfa","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"SiT: Self-supervised vision transformer","work_id":"dfb6e938-6944-4a23-a4eb-bb29a3f1b178","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"BEiT: BERT Pre-Training of Image Transformers","work_id":"d74eda3c-bf7e-45f1-a8f1-a0137ecca3f4","ref_index":3,"cited_arxiv_id":"2106.08254","is_internal_anchor":true},{"doi":"","year":2022,"title":"10 Published as a conference paper at ICLR 2022 Kaiming He, Georgia Gkioxari, Piotr Doll´ar, and Ross Girshick. Mask R-CNN. In ICCV,","work_id":"fdf1d992-c87e-49d9-bd0b-b1c5ae3e69d0","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Efﬁcient self-supervised vision transformers for representation learning","work_id":"e1917362-f7ab-4e00-8f21-57a21257f2d9","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":23,"snapshot_sha256":"53eebe40e22818ce9af4f5a4f50f43873c05034b7d8c8d99ede456ebb1902ba5","internal_anchors":3},"formal_canon":{"evidence_count":3,"snapshot_sha256":"1242401ff0d023df60eb0eec7aa3921a6536baf868184aef253a4429cfd9d342"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"63891510-fb3d-4921-9600-2e7adc41f084"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T02:42:36Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"URD+aebujnvjdxhZrvXWkfDVuamSArRtiyXKaY3WpBAdIaEC9PulOqun365ThUeeVEvVYXy4X5tMxKoK048hAg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-27T11:24:37.447837Z"},"content_sha256":"5c70e8b96a5c13872e83d3b05a2553e7c2730eab28c6a538e39268a5ecbf868b","schema_version":"1.0","event_id":"sha256:5c70e8b96a5c13872e83d3b05a2553e7c2730eab28c6a538e39268a5ecbf868b"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/2XX22VJUL7SIEBCQF3WFMGTECS/bundle.json","state_url":"https://pith.science/pith/2XX22VJUL7SIEBCQF3WFMGTECS/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/2XX22VJUL7SIEBCQF3WFMGTECS/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-27T11:24:37Z","links":{"resolver":"https://pith.science/pith/2XX22VJUL7SIEBCQF3WFMGTECS","bundle":"https://pith.science/pith/2XX22VJUL7SIEBCQF3WFMGTECS/bundle.json","state":"https://pith.science/pith/2XX22VJUL7SIEBCQF3WFMGTECS/state.json","well_known_bundle":"https://pith.science/.well-known/pith/2XX22VJUL7SIEBCQF3WFMGTECS/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2021:2XX22VJUL7SIEBCQF3WFMGTECS","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"6e7728ee498ae978c428082bad3072a8254578a2aac25125ad12a37dae769d5b","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2021-11-15T15:18:05Z","title_canon_sha256":"c60bc85e27b6e89d8c0f68d113d4613cc285eca8d8a9681eed9b0379dc67c9d8"},"schema_version":"1.0","source":{"id":"2111.07832","kind":"arxiv","version":3}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2111.07832","created_at":"2026-05-18T02:42:36Z"},{"alias_kind":"arxiv_version","alias_value":"2111.07832v3","created_at":"2026-05-18T02:42:36Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2111.07832","created_at":"2026-05-18T02:42:36Z"},{"alias_kind":"pith_short_12","alias_value":"2XX22VJUL7SI","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_16","alias_value":"2XX22VJUL7SIEBCQ","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_8","alias_value":"2XX22VJU","created_at":"2026-05-18T12:33:33Z"}],"graph_snapshots":[{"event_id":"sha256:5c70e8b96a5c13872e83d3b05a2553e7c2730eab28c6a538e39268a5ecbf868b","target":"graph","created_at":"2026-05-18T02:42:36Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"We present a self-supervised framework iBOT that can perform masked prediction with an online tokenizer... achieving an 82.3% linear probing accuracy and an 87.8% fine-tuning accuracy evaluated on ImageNet-1K."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"The assumption that self-distillation with an online tokenizer can produce semantically meaningful visual tokens without prior pre-training of the tokenizer."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"iBOT achieves 82.3% linear probing accuracy and 87.8% fine-tuning accuracy on ImageNet-1K using masked image modeling with a jointly trained online tokenizer."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"iBOT uses a jointly learned online tokenizer for masked image modeling to reach 82.3 percent linear probing accuracy on ImageNet-1K."}],"snapshot_sha256":"efb23a5c7423d8692304f14fe80d19ff23ce661cbbbf38de264e5edcb7363627"},"formal_canon":{"evidence_count":3,"snapshot_sha256":"1242401ff0d023df60eb0eec7aa3921a6536baf868184aef253a4429cfd9d342"},"paper":{"abstract_excerpt":"The success of language Transformers is primarily attributed to the pretext task of masked language modeling (MLM), where texts are first tokenized into semantically meaningful pieces. In this work, we study masked image modeling (MIM) and indicate the advantages and challenges of using a semantically meaningful visual tokenizer. We present a self-supervised framework iBOT that can perform masked prediction with an online tokenizer. Specifically, we perform self-distillation on masked patch tokens and take the teacher network as the online tokenizer, along with self-distillation on the class t","authors_text":"Alan Yuille, Chen Wei, Cihang Xie, Huiyu Wang, Jinghao Zhou, Tao Kong, Wei Shen","cross_cats":[],"headline":"iBOT uses a jointly learned online tokenizer for masked image modeling to reach 82.3 percent linear probing accuracy on ImageNet-1K.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2021-11-15T15:18:05Z","title":"iBOT: Image BERT Pre-Training with Online Tokenizer"},"references":{"count":23,"internal_anchors":3,"resolved_work":23,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Self-supervised classiﬁcation network","work_id":"f0b10977-594c-4ab5-8f0d-d096f4f37bfa","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"SiT: Self-supervised vision transformer","work_id":"dfb6e938-6944-4a23-a4eb-bb29a3f1b178","year":null},{"cited_arxiv_id":"2106.08254","doi":"","is_internal_anchor":true,"ref_index":3,"title":"BEiT: BERT Pre-Training of Image Transformers","work_id":"d74eda3c-bf7e-45f1-a8f1-a0137ecca3f4","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"10 Published as a conference paper at ICLR 2022 Kaiming He, Georgia Gkioxari, Piotr Doll´ar, and Ross Girshick. Mask R-CNN. In ICCV,","work_id":"fdf1d992-c87e-49d9-bd0b-b1c5ae3e69d0","year":2022},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Efﬁcient self-supervised vision transformers for representation learning","work_id":"e1917362-f7ab-4e00-8f21-57a21257f2d9","year":null}],"snapshot_sha256":"53eebe40e22818ce9af4f5a4f50f43873c05034b7d8c8d99ede456ebb1902ba5"},"source":{"id":"2111.07832","kind":"arxiv","version":3},"verdict":{"created_at":"2026-05-14T02:05:18.808809Z","id":"63891510-fb3d-4921-9600-2e7adc41f084","model_set":{"reader":"grok-4.3"},"one_line_summary":"iBOT achieves 82.3% linear probing accuracy and 87.8% fine-tuning accuracy on ImageNet-1K using masked image modeling with a jointly trained online tokenizer.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"iBOT uses a jointly learned online tokenizer for masked image modeling to reach 82.3 percent linear probing accuracy on ImageNet-1K.","strongest_claim":"We present a self-supervised framework iBOT that can perform masked prediction with an online tokenizer... achieving an 82.3% linear probing accuracy and an 87.8% fine-tuning accuracy evaluated on ImageNet-1K.","weakest_assumption":"The assumption that self-distillation with an online tokenizer can produce semantically meaningful visual tokens without prior pre-training of the tokenizer."}},"verdict_id":"63891510-fb3d-4921-9600-2e7adc41f084"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:2fa69649bffe4611434f0222a640fc1f34105ad16c00a1d208034b8a7e6a1b1d","target":"record","created_at":"2026-05-18T02:42:36Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"6e7728ee498ae978c428082bad3072a8254578a2aac25125ad12a37dae769d5b","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2021-11-15T15:18:05Z","title_canon_sha256":"c60bc85e27b6e89d8c0f68d113d4613cc285eca8d8a9681eed9b0379dc67c9d8"},"schema_version":"1.0","source":{"id":"2111.07832","kind":"arxiv","version":3}},"canonical_sha256":"d5efad55345fe48204502eec561a6414a6779317b3fb51db2e0c3193230934d3","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"d5efad55345fe48204502eec561a6414a6779317b3fb51db2e0c3193230934d3","first_computed_at":"2026-05-18T02:42:36.995746Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-18T02:42:36.995746Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"r5cA3uqJ8fFKomZyrp3Du9IVYCoZZVBt14Lhlrpgyx0gaauZmJ9Cs7OTKbwjglQTq/r+hWt8WbRdRJPov/meCA==","signature_status":"signed_v1","signed_at":"2026-05-18T02:42:36.996421Z","signed_message":"canonical_sha256_bytes"},"source_id":"2111.07832","source_kind":"arxiv","source_version":3}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:2fa69649bffe4611434f0222a640fc1f34105ad16c00a1d208034b8a7e6a1b1d","sha256:5c70e8b96a5c13872e83d3b05a2553e7c2730eab28c6a538e39268a5ecbf868b"],"state_sha256":"33f8e9cce039034a562688057773b5d93056f92b44fe1855768fadbf6a75f7e5"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"yOKgihvADgmjXyugJatFd9Ueuq3zzBM1M3y6XRdqQbb4kUJ3uF14zdhDpwXuiVetH6HkUkFm72fgp/THclKjDQ==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-27T11:24:37.452043Z","bundle_sha256":"11a6693b038c7883c3099152dd3addb6b3784c9619fdd8890ad7edbb6b682599"}}