{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2023:X5N46CMJETB4TB5ED6TJANC2FV","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"cc0332e47d7937841046a3090d20ffdd67b32b1eb76a47acb7de21d23a708af6","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2023-03-27T15:53:01Z","title_canon_sha256":"7f4a056490916a9af113a28c02097641876e16d9ab2879231c7be248053b6bdb"},"schema_version":"1.0","source":{"id":"2303.15343","kind":"arxiv","version":4}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2303.15343","created_at":"2026-05-17T23:38:47Z"},{"alias_kind":"arxiv_version","alias_value":"2303.15343v4","created_at":"2026-05-17T23:38:47Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2303.15343","created_at":"2026-05-17T23:38:47Z"},{"alias_kind":"pith_short_12","alias_value":"X5N46CMJETB4","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"X5N46CMJETB4TB5E","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"X5N46CMJ","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:a1072a782e82357e321d2baa176b48574105f3780c1793a0a47b3c3b69f1248d","target":"graph","created_at":"2026-05-17T23:38:47Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Combined with Locked-image Tuning, with only four TPUv4 chips, we train a SigLiT model that achieves 84.5% ImageNet zero-shot accuracy in two days."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the sigmoid loss, which forgoes global batch normalization, will continue to produce high-quality representations when scaled to new datasets or model sizes without additional hyper-parameter tuning."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"SigLIP replaces softmax-based contrastive loss with a simple pairwise sigmoid loss for vision-language pre-training, decoupling batch size from normalization and reaching strong zero-shot performance with limited compute."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"A pairwise sigmoid loss for image-text pre-training achieves 84.5% zero-shot ImageNet accuracy using only four TPU chips in two days."}],"snapshot_sha256":"d80ef83f6df86463442678c0aff55eadcfaffab928244d2d897958b5ce0d08b4"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"We propose a simple pairwise Sigmoid loss for Language-Image Pre-training (SigLIP). Unlike standard contrastive learning with softmax normalization, the sigmoid loss operates solely on image-text pairs and does not require a global view of the pairwise similarities for normalization. The sigmoid loss simultaneously allows further scaling up the batch size, while also performing better at smaller batch sizes. Combined with Locked-image Tuning, with only four TPUv4 chips, we train a SigLiT model that achieves 84.5% ImageNet zero-shot accuracy in two days. The disentanglement of the batch size fr","authors_text":"Alexander Kolesnikov, Basil Mustafa, Lucas Beyer, Xiaohua Zhai","cross_cats":["cs.AI"],"headline":"A pairwise sigmoid loss for image-text pre-training achieves 84.5% zero-shot ImageNet accuracy using only four TPU chips in two days.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2023-03-27T15:53:01Z","title":"Sigmoid Loss for Language Image Pre-Training"},"references":{"count":60,"internal_anchors":9,"resolved_work":60,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Getting vit in shape: Scaling laws for compute-optimal model design","work_id":"3d85a12f-9454-4f5c-bca9-b96d474ddde2","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"ObjectNet: A large-scale bias-controlled dataset for pushing the limits of object recognition models","work_id":"2f994755-ea11-439a-a510-79be7aa13443","year":2019},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Are we done with imagenet?","work_id":"9efae043-283b-44ae-8324-207d3747f93f","year":2006},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Bet- ter plain vit baselines for imagenet-1k, 2022","work_id":"8b408975-f8cf-4010-ace0-d6cd6ac702ec","year":2022},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Lucas Beyer, Xiaohua Zhai, and Alexander Kolesnikov. Big vision. https://github.com/google-research/ big_vision, 2022. 10, 17","work_id":"96277c24-f45f-4e02-b1c2-2e713c7788c7","year":2022}],"snapshot_sha256":"beed77e7aaad0bc528e634f9078674503ab1cdd1b8cf0caad84708ba4148c8e0"},"source":{"id":"2303.15343","kind":"arxiv","version":4},"verdict":{"created_at":"2026-05-16T13:00:19.151560Z","id":"7220d1d9-574d-4d8a-a61f-c6040846dd57","model_set":{"reader":"grok-4.3"},"one_line_summary":"SigLIP replaces softmax-based contrastive loss with a simple pairwise sigmoid loss for vision-language pre-training, decoupling batch size from normalization and reaching strong zero-shot performance with limited compute.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"A pairwise sigmoid loss for image-text pre-training achieves 84.5% zero-shot ImageNet accuracy using only four TPU chips in two days.","strongest_claim":"Combined with Locked-image Tuning, with only four TPUv4 chips, we train a SigLiT model that achieves 84.5% ImageNet zero-shot accuracy in two days.","weakest_assumption":"That the sigmoid loss, which forgoes global batch normalization, will continue to produce high-quality representations when scaled to new datasets or model sizes without additional hyper-parameter tuning."}},"verdict_id":"7220d1d9-574d-4d8a-a61f-c6040846dd57"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:6b8e180f7e3260b8952ba3933ca81437ef8459335358755c825438d7f0a69673","target":"record","created_at":"2026-05-17T23:38:47Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"cc0332e47d7937841046a3090d20ffdd67b32b1eb76a47acb7de21d23a708af6","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2023-03-27T15:53:01Z","title_canon_sha256":"7f4a056490916a9af113a28c02097641876e16d9ab2879231c7be248053b6bdb"},"schema_version":"1.0","source":{"id":"2303.15343","kind":"arxiv","version":4}},"canonical_sha256":"bf5bcf098924c3c987a41fa690345a2d4c3f37b27b81749215a8edb78e20ed8c","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"bf5bcf098924c3c987a41fa690345a2d4c3f37b27b81749215a8edb78e20ed8c","first_computed_at":"2026-05-17T23:38:47.785603Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:47.785603Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"nGnGFCYMhIikKsby995XPS6hdDYqbDY/tyy0U4q70ifbffFx5gp92nO8cvLfhYb2MgHtuP6TsDF70kBAW92MAw==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:47.786092Z","signed_message":"canonical_sha256_bytes"},"source_id":"2303.15343","source_kind":"arxiv","source_version":4}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:6b8e180f7e3260b8952ba3933ca81437ef8459335358755c825438d7f0a69673","sha256:a1072a782e82357e321d2baa176b48574105f3780c1793a0a47b3c3b69f1248d"],"state_sha256":"35d540a5e5e77042f8d82f4e28391be89e711adaa0d2326098198a2decac5868"}