{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2023:X5N46CMJETB4TB5ED6TJANC2FV","short_pith_number":"pith:X5N46CMJ","canonical_record":{"source":{"id":"2303.15343","kind":"arxiv","version":4},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2023-03-27T15:53:01Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"7f4a056490916a9af113a28c02097641876e16d9ab2879231c7be248053b6bdb","abstract_canon_sha256":"cc0332e47d7937841046a3090d20ffdd67b32b1eb76a47acb7de21d23a708af6"},"schema_version":"1.0"},"canonical_sha256":"bf5bcf098924c3c987a41fa690345a2d4c3f37b27b81749215a8edb78e20ed8c","source":{"kind":"arxiv","id":"2303.15343","version":4},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2303.15343","created_at":"2026-05-17T23:38:47Z"},{"alias_kind":"arxiv_version","alias_value":"2303.15343v4","created_at":"2026-05-17T23:38:47Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2303.15343","created_at":"2026-05-17T23:38:47Z"},{"alias_kind":"pith_short_12","alias_value":"X5N46CMJETB4","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"X5N46CMJETB4TB5E","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"X5N46CMJ","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2023:X5N46CMJETB4TB5ED6TJANC2FV","target":"record","payload":{"canonical_record":{"source":{"id":"2303.15343","kind":"arxiv","version":4},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2023-03-27T15:53:01Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"7f4a056490916a9af113a28c02097641876e16d9ab2879231c7be248053b6bdb","abstract_canon_sha256":"cc0332e47d7937841046a3090d20ffdd67b32b1eb76a47acb7de21d23a708af6"},"schema_version":"1.0"},"canonical_sha256":"bf5bcf098924c3c987a41fa690345a2d4c3f37b27b81749215a8edb78e20ed8c","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:47.786092Z","signature_b64":"nGnGFCYMhIikKsby995XPS6hdDYqbDY/tyy0U4q70ifbffFx5gp92nO8cvLfhYb2MgHtuP6TsDF70kBAW92MAw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"bf5bcf098924c3c987a41fa690345a2d4c3f37b27b81749215a8edb78e20ed8c","last_reissued_at":"2026-05-17T23:38:47.785603Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:47.785603Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2303.15343","source_version":4,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:47Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"Qnae5wMpGf9xX6z+v+y3zQxAszCx6OAWJV+8DcbfwPfBTJAeRiYhB4UKzQwgMWj6/auAWdVLCSAtySfWgWaFDQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-01T08:39:47.013778Z"},"content_sha256":"6b8e180f7e3260b8952ba3933ca81437ef8459335358755c825438d7f0a69673","schema_version":"1.0","event_id":"sha256:6b8e180f7e3260b8952ba3933ca81437ef8459335358755c825438d7f0a69673"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2023:X5N46CMJETB4TB5ED6TJANC2FV","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Sigmoid Loss for Language Image Pre-Training","license":"http://creativecommons.org/licenses/by/4.0/","headline":"A pairwise sigmoid loss for image-text pre-training achieves 84.5% zero-shot ImageNet accuracy using only four TPU chips in two days.","cross_cats":["cs.AI"],"primary_cat":"cs.CV","authors_text":"Alexander Kolesnikov, Basil Mustafa, Lucas Beyer, Xiaohua Zhai","submitted_at":"2023-03-27T15:53:01Z","abstract_excerpt":"We propose a simple pairwise Sigmoid loss for Language-Image Pre-training (SigLIP). Unlike standard contrastive learning with softmax normalization, the sigmoid loss operates solely on image-text pairs and does not require a global view of the pairwise similarities for normalization. The sigmoid loss simultaneously allows further scaling up the batch size, while also performing better at smaller batch sizes. Combined with Locked-image Tuning, with only four TPUv4 chips, we train a SigLiT model that achieves 84.5% ImageNet zero-shot accuracy in two days. The disentanglement of the batch size fr"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Combined with Locked-image Tuning, with only four TPUv4 chips, we train a SigLiT model that achieves 84.5% ImageNet zero-shot accuracy in two days.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the sigmoid loss, which forgoes global batch normalization, will continue to produce high-quality representations when scaled to new datasets or model sizes without additional hyper-parameter tuning.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"SigLIP replaces softmax-based contrastive loss with a simple pairwise sigmoid loss for vision-language pre-training, decoupling batch size from normalization and reaching strong zero-shot performance with limited compute.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"A pairwise sigmoid loss for image-text pre-training achieves 84.5% zero-shot ImageNet accuracy using only four TPU chips in two days.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"d80ef83f6df86463442678c0aff55eadcfaffab928244d2d897958b5ce0d08b4"},"source":{"id":"2303.15343","kind":"arxiv","version":4},"verdict":{"id":"7220d1d9-574d-4d8a-a61f-c6040846dd57","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-16T13:00:19.151560Z","strongest_claim":"Combined with Locked-image Tuning, with only four TPUv4 chips, we train a SigLiT model that achieves 84.5% ImageNet zero-shot accuracy in two days.","one_line_summary":"SigLIP replaces softmax-based contrastive loss with a simple pairwise sigmoid loss for vision-language pre-training, decoupling batch size from normalization and reaching strong zero-shot performance with limited compute.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the sigmoid loss, which forgoes global batch normalization, will continue to produce high-quality representations when scaled to new datasets or model sizes without additional hyper-parameter tuning.","pith_extraction_headline":"A pairwise sigmoid loss for image-text pre-training achieves 84.5% zero-shot ImageNet accuracy using only four TPU chips in two days."},"references":{"count":60,"sample":[{"doi":"","year":2023,"title":"Getting vit in shape: Scaling laws for compute-optimal model design","work_id":"3d85a12f-9454-4f5c-bca9-b96d474ddde2","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2019,"title":"ObjectNet: A large-scale bias-controlled dataset for pushing the limits of object recognition models","work_id":"2f994755-ea11-439a-a510-79be7aa13443","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2006,"title":"Are we done with imagenet?","work_id":"9efae043-283b-44ae-8324-207d3747f93f","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2022,"title":"Bet- ter plain vit baselines for imagenet-1k, 2022","work_id":"8b408975-f8cf-4010-ace0-d6cd6ac702ec","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2022,"title":"Lucas Beyer, Xiaohua Zhai, and Alexander Kolesnikov. Big vision. https://github.com/google-research/ big_vision, 2022. 10, 17","work_id":"96277c24-f45f-4e02-b1c2-2e713c7788c7","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":60,"snapshot_sha256":"beed77e7aaad0bc528e634f9078674503ab1cdd1b8cf0caad84708ba4148c8e0","internal_anchors":9},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"7220d1d9-574d-4d8a-a61f-c6040846dd57"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:47Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"FP26jR8ocAK3zorjP9hjf3veGR1yaY2XZ9R5azdKfWBt0dzGi0HQW0LfExKAXj7zTkVOu6BOpJ0SHnL6BRatAw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-01T08:39:47.014697Z"},"content_sha256":"a1072a782e82357e321d2baa176b48574105f3780c1793a0a47b3c3b69f1248d","schema_version":"1.0","event_id":"sha256:a1072a782e82357e321d2baa176b48574105f3780c1793a0a47b3c3b69f1248d"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/X5N46CMJETB4TB5ED6TJANC2FV/bundle.json","state_url":"https://pith.science/pith/X5N46CMJETB4TB5ED6TJANC2FV/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/X5N46CMJETB4TB5ED6TJANC2FV/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-01T08:39:47Z","links":{"resolver":"https://pith.science/pith/X5N46CMJETB4TB5ED6TJANC2FV","bundle":"https://pith.science/pith/X5N46CMJETB4TB5ED6TJANC2FV/bundle.json","state":"https://pith.science/pith/X5N46CMJETB4TB5ED6TJANC2FV/state.json","well_known_bundle":"https://pith.science/.well-known/pith/X5N46CMJETB4TB5ED6TJANC2FV/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2023:X5N46CMJETB4TB5ED6TJANC2FV","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"cc0332e47d7937841046a3090d20ffdd67b32b1eb76a47acb7de21d23a708af6","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2023-03-27T15:53:01Z","title_canon_sha256":"7f4a056490916a9af113a28c02097641876e16d9ab2879231c7be248053b6bdb"},"schema_version":"1.0","source":{"id":"2303.15343","kind":"arxiv","version":4}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2303.15343","created_at":"2026-05-17T23:38:47Z"},{"alias_kind":"arxiv_version","alias_value":"2303.15343v4","created_at":"2026-05-17T23:38:47Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2303.15343","created_at":"2026-05-17T23:38:47Z"},{"alias_kind":"pith_short_12","alias_value":"X5N46CMJETB4","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"X5N46CMJETB4TB5E","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"X5N46CMJ","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:a1072a782e82357e321d2baa176b48574105f3780c1793a0a47b3c3b69f1248d","target":"graph","created_at":"2026-05-17T23:38:47Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Combined with Locked-image Tuning, with only four TPUv4 chips, we train a SigLiT model that achieves 84.5% ImageNet zero-shot accuracy in two days."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the sigmoid loss, which forgoes global batch normalization, will continue to produce high-quality representations when scaled to new datasets or model sizes without additional hyper-parameter tuning."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"SigLIP replaces softmax-based contrastive loss with a simple pairwise sigmoid loss for vision-language pre-training, decoupling batch size from normalization and reaching strong zero-shot performance with limited compute."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"A pairwise sigmoid loss for image-text pre-training achieves 84.5% zero-shot ImageNet accuracy using only four TPU chips in two days."}],"snapshot_sha256":"d80ef83f6df86463442678c0aff55eadcfaffab928244d2d897958b5ce0d08b4"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"We propose a simple pairwise Sigmoid loss for Language-Image Pre-training (SigLIP). Unlike standard contrastive learning with softmax normalization, the sigmoid loss operates solely on image-text pairs and does not require a global view of the pairwise similarities for normalization. The sigmoid loss simultaneously allows further scaling up the batch size, while also performing better at smaller batch sizes. Combined with Locked-image Tuning, with only four TPUv4 chips, we train a SigLiT model that achieves 84.5% ImageNet zero-shot accuracy in two days. The disentanglement of the batch size fr","authors_text":"Alexander Kolesnikov, Basil Mustafa, Lucas Beyer, Xiaohua Zhai","cross_cats":["cs.AI"],"headline":"A pairwise sigmoid loss for image-text pre-training achieves 84.5% zero-shot ImageNet accuracy using only four TPU chips in two days.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2023-03-27T15:53:01Z","title":"Sigmoid Loss for Language Image Pre-Training"},"references":{"count":60,"internal_anchors":9,"resolved_work":60,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Getting vit in shape: Scaling laws for compute-optimal model design","work_id":"3d85a12f-9454-4f5c-bca9-b96d474ddde2","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"ObjectNet: A large-scale bias-controlled dataset for pushing the limits of object recognition models","work_id":"2f994755-ea11-439a-a510-79be7aa13443","year":2019},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Are we done with imagenet?","work_id":"9efae043-283b-44ae-8324-207d3747f93f","year":2006},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Bet- ter plain vit baselines for imagenet-1k, 2022","work_id":"8b408975-f8cf-4010-ace0-d6cd6ac702ec","year":2022},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Lucas Beyer, Xiaohua Zhai, and Alexander Kolesnikov. Big vision. https://github.com/google-research/ big_vision, 2022. 10, 17","work_id":"96277c24-f45f-4e02-b1c2-2e713c7788c7","year":2022}],"snapshot_sha256":"beed77e7aaad0bc528e634f9078674503ab1cdd1b8cf0caad84708ba4148c8e0"},"source":{"id":"2303.15343","kind":"arxiv","version":4},"verdict":{"created_at":"2026-05-16T13:00:19.151560Z","id":"7220d1d9-574d-4d8a-a61f-c6040846dd57","model_set":{"reader":"grok-4.3"},"one_line_summary":"SigLIP replaces softmax-based contrastive loss with a simple pairwise sigmoid loss for vision-language pre-training, decoupling batch size from normalization and reaching strong zero-shot performance with limited compute.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"A pairwise sigmoid loss for image-text pre-training achieves 84.5% zero-shot ImageNet accuracy using only four TPU chips in two days.","strongest_claim":"Combined with Locked-image Tuning, with only four TPUv4 chips, we train a SigLiT model that achieves 84.5% ImageNet zero-shot accuracy in two days.","weakest_assumption":"That the sigmoid loss, which forgoes global batch normalization, will continue to produce high-quality representations when scaled to new datasets or model sizes without additional hyper-parameter tuning."}},"verdict_id":"7220d1d9-574d-4d8a-a61f-c6040846dd57"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:6b8e180f7e3260b8952ba3933ca81437ef8459335358755c825438d7f0a69673","target":"record","created_at":"2026-05-17T23:38:47Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"cc0332e47d7937841046a3090d20ffdd67b32b1eb76a47acb7de21d23a708af6","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2023-03-27T15:53:01Z","title_canon_sha256":"7f4a056490916a9af113a28c02097641876e16d9ab2879231c7be248053b6bdb"},"schema_version":"1.0","source":{"id":"2303.15343","kind":"arxiv","version":4}},"canonical_sha256":"bf5bcf098924c3c987a41fa690345a2d4c3f37b27b81749215a8edb78e20ed8c","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"bf5bcf098924c3c987a41fa690345a2d4c3f37b27b81749215a8edb78e20ed8c","first_computed_at":"2026-05-17T23:38:47.785603Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:47.785603Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"nGnGFCYMhIikKsby995XPS6hdDYqbDY/tyy0U4q70ifbffFx5gp92nO8cvLfhYb2MgHtuP6TsDF70kBAW92MAw==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:47.786092Z","signed_message":"canonical_sha256_bytes"},"source_id":"2303.15343","source_kind":"arxiv","source_version":4}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:6b8e180f7e3260b8952ba3933ca81437ef8459335358755c825438d7f0a69673","sha256:a1072a782e82357e321d2baa176b48574105f3780c1793a0a47b3c3b69f1248d"],"state_sha256":"35d540a5e5e77042f8d82f4e28391be89e711adaa0d2326098198a2decac5868"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"WWz7EmkqfX58kPI5iCaW/zzw6ySE5yMvGhu20Y6pnGaOoY90/NfpNx1yA2Oi6Re5e2BxYijBBOaUC4DmmvJmAg==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-01T08:39:47.017090Z","bundle_sha256":"3bdfb3bdae2d78fd8796e29c3d9d0425d0c012e7402baccb788e620708db29e9"}}