{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2024:DULKBTOBJA4XIQFVFZQSJCUWTM","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"501a62ef5aaa69b608bba60f579b7b53b44da05a7c37f19dc796242d5c2856e2","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2024-12-04T18:50:42Z","title_canon_sha256":"db490a13d82e857cc0961af6be15e96b9c4e49e1742d092153959eaaaf28eacf"},"schema_version":"1.0","source":{"id":"2412.03555","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2412.03555","created_at":"2026-05-17T23:38:52Z"},{"alias_kind":"arxiv_version","alias_value":"2412.03555v1","created_at":"2026-05-17T23:38:52Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2412.03555","created_at":"2026-05-17T23:38:52Z"},{"alias_kind":"pith_short_12","alias_value":"DULKBTOBJA4X","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"DULKBTOBJA4XIQFV","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"DULKBTOB","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:6bbbbfe90b09da0217f7a5a6d67bc40d57a3ee6bd28926166bf104db56f93578","target":"graph","created_at":"2026-05-17T23:38:52Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"PaliGemma 2 obtains state-of-the-art results on different OCR-related tasks such as table structure recognition, molecular structure recognition, music score recognition, as well as long fine-grained captioning and radiography report generation."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That multi-stage training at multiple resolutions equips the models with broad transferable knowledge; the abstract provides no controlled ablations or details on how this is verified versus simpler baselines."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"PaliGemma 2 is a family of vision-language models that achieves state-of-the-art results on transfer tasks like table structure recognition and radiography report generation by combining SigLIP with Gemma 2 models at various sizes and resolutions."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"PaliGemma 2 pairs Gemma 2 language models with SigLIP encoders and trains them at multiple resolutions to achieve strong transfer on OCR and captioning tasks."}],"snapshot_sha256":"4755c3abfb93bd345343458493b5a814eaee1c0bbc36412813d07e56ec8000bd"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"74b3134658523acba323f9b21695676acc3743d9fdef6a89637a3ea3354a1607"},"paper":{"abstract_excerpt":"PaliGemma 2 is an upgrade of the PaliGemma open Vision-Language Model (VLM) based on the Gemma 2 family of language models. We combine the SigLIP-So400m vision encoder that was also used by PaliGemma with the whole range of Gemma 2 models, from the 2B one all the way up to the 27B model. We train these models at three resolutions (224px, 448px, and 896px) in multiple stages to equip them with broad knowledge for transfer via fine-tuning. The resulting family of base models covering different model sizes and resolutions allows us to investigate factors impacting transfer performance (such as le","authors_text":"Alexey Gritsenko, Andreas Steiner, Andr\\'e Susano Pinto, Anthony Sherbondy, Daniel Keysers, Emanuele Bugliarello, Ibrahim Alabdulmohsin, Lucas Beyer, Matthias Minderer, Michael Tschannen, Reeve Ingle, Sahar Kazemzadeh, Shangbang Long, Siyang Qin, Thomas Mesnard, Xiaohua Zhai, Xiao Wang, Yonatan Bitton","cross_cats":[],"headline":"PaliGemma 2 pairs Gemma 2 language models with SigLIP encoders and trains them at multiple resolutions to achieve strong transfer on OCR and captioning tasks.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2024-12-04T18:50:42Z","title":"PaliGemma 2: A Family of Versatile VLMs for Transfer"},"references":{"count":113,"internal_anchors":13,"resolved_work":113,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"M. Acharya, K. Kafle, and C. Kanan. Tal- lyQA: Answering complex counting ques- tions. InAAAI, 2019","work_id":"abd3cdf7-6b70-45ad-af36-e39c3e976935","year":2019},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"H. Agrawal, K. Desai, Y. Wang, X. Chen, R. Jain, M. Johnson, D. Batra, D. Parikh, S. Lee, and P. Anderson. NoCaps: Novel object captioning at scale. InICCV, 2019","work_id":"9c92bc55-33f1-436d-be7d-c34462bead7f","year":2019},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"I. Alabdulmohsin, X. Zhai, A. Kolesnikov, and L. Beyer. Getting vit in shape: Scaling laws for compute-optimal model design. In NeurIPS, 2023","work_id":"032c297a-aead-419b-a05e-2dc732acd9b1","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"J.-B. Alayrac, J. Donahue, P. Luc, A. Miech, I. Barr, Y. Hasson, K. Lenc, A. Men- sch, K. Millican, M. Reynolds, R. Ring, E. Rutherford, S. Cabi, T. Han, Z. Gong, S. Samangooei, M. Monteiro, J. Menick","work_id":"dcd6eeb3-2800-427a-a51b-14d521d049e0","year":2022},{"cited_arxiv_id":"2308.12966","doi":"","is_internal_anchor":true,"ref_index":5,"title":"Qwen-VL: A Versatile Vision-Language Model for Understanding, Localization, Text Reading, and Beyond","work_id":"cbc2bb21-b6bb-46c0-80bf-107e195ffe10","year":2023}],"snapshot_sha256":"a57dd2d510ec25ed2c6898eee33ffd97403e1f2d854e763515bd84be3e268509"},"source":{"id":"2412.03555","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-15T09:10:03.840284Z","id":"8151d1fc-65c0-4075-858e-030a31b7c61b","model_set":{"reader":"grok-4.3"},"one_line_summary":"PaliGemma 2 is a family of vision-language models that achieves state-of-the-art results on transfer tasks like table structure recognition and radiography report generation by combining SigLIP with Gemma 2 models at various sizes and resolutions.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"PaliGemma 2 pairs Gemma 2 language models with SigLIP encoders and trains them at multiple resolutions to achieve strong transfer on OCR and captioning tasks.","strongest_claim":"PaliGemma 2 obtains state-of-the-art results on different OCR-related tasks such as table structure recognition, molecular structure recognition, music score recognition, as well as long fine-grained captioning and radiography report generation.","weakest_assumption":"That multi-stage training at multiple resolutions equips the models with broad transferable knowledge; the abstract provides no controlled ablations or details on how this is verified versus simpler baselines."}},"verdict_id":"8151d1fc-65c0-4075-858e-030a31b7c61b"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:a3e85a8544b96c85afa8c093506324dceddd3148b2311c9a2227a5c67aab63e6","target":"record","created_at":"2026-05-17T23:38:52Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"501a62ef5aaa69b608bba60f579b7b53b44da05a7c37f19dc796242d5c2856e2","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2024-12-04T18:50:42Z","title_canon_sha256":"db490a13d82e857cc0961af6be15e96b9c4e49e1742d092153959eaaaf28eacf"},"schema_version":"1.0","source":{"id":"2412.03555","kind":"arxiv","version":1}},"canonical_sha256":"1d16a0cdc148397440b52e61248a969b15fac2d9e1570c0a7906224e956eda27","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"1d16a0cdc148397440b52e61248a969b15fac2d9e1570c0a7906224e956eda27","first_computed_at":"2026-05-17T23:38:52.926181Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:52.926181Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"77mg6oFjm6mdmV3hLtwRRCm4wMxvMaevSiWtKXBOSOh58nlis6Ra8kOlaWhLtv8/4XK3joK1YrRpgW+q3s+LBg==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:52.926803Z","signed_message":"canonical_sha256_bytes"},"source_id":"2412.03555","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:a3e85a8544b96c85afa8c093506324dceddd3148b2311c9a2227a5c67aab63e6","sha256:6bbbbfe90b09da0217f7a5a6d67bc40d57a3ee6bd28926166bf104db56f93578"],"state_sha256":"52fcbda36656c41526bb144e878c5c8fb2c556ad5b41f60f725684888c96e911"}