{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2023:W4TPLI53LVIHYZ6UCVBRL3443S","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"94e18e05584083e5b1cbbe254aba4a9c4cbb4c6b8dc63d01ec220ef24117dc7a","cross_cats_sorted":["cs.CL","cs.LG"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2023-05-29T18:58:38Z","title_canon_sha256":"3c7150397808e13ff72bab5cc440587de27d0854e43b1fc23a199ea026595a24"},"schema_version":"1.0","source":{"id":"2305.18565","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2305.18565","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"arxiv_version","alias_value":"2305.18565v1","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2305.18565","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"pith_short_12","alias_value":"W4TPLI53LVIH","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"W4TPLI53LVIHYZ6U","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"W4TPLI53","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:e88a1d6ed96745e36afc9feb96476163df19dba9e233ccd7fae86eb9195474bb","target":"graph","created_at":"2026-05-17T23:38:13Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"PaLI-X advances the state-of-the-art on most vision-and-language benchmarks considered (25+ of them) and exhibits emerging capabilities such as complex counting and multilingual object detection."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That increasing model size and broadening the training task mixture will reliably produce both higher benchmark scores and the observed emergent behaviors without requiring task-specific fine-tuning or additional architectural changes."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Scaling a multilingual vision-language model in size and training breadth yields new state-of-the-art results on over 25 benchmarks plus emerging abilities in counting and multilingual detection."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Scaling up PaLI-X sets new state-of-the-art on most vision and language benchmarks and shows emergent capabilities."}],"snapshot_sha256":"9ad1cfef05ed1adf2e3fdf89e586509cc9603f986e096d420ec74015657cd5e6"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"9114177088637e7dacb004206e60967cf021ef82e55e271b66073aa58b9c86c5"},"paper":{"abstract_excerpt":"We present the training recipe and results of scaling up PaLI-X, a multilingual vision and language model, both in terms of size of the components and the breadth of its training task mixture. Our model achieves new levels of performance on a wide-range of varied and complex tasks, including multiple image-based captioning and question-answering tasks, image-based document understanding and few-shot (in-context) learning, as well as object detection, video question answering, and video captioning. PaLI-X advances the state-of-the-art on most vision-and-language benchmarks considered (25+ of th","authors_text":"AJ Piergiovanni, Alexander Kolesnikov, Andreas Peter Steiner, Anelia Angelova, Anurag Arnab, Arsha Nagrani, Austin Waters, Basil Mustafa, Bo Pang, Carlos Riquelme Ruiz, Ceslee Montgomery, Daniel Keysers, Daniel Salz, Filip Pavetic, Gang Li, Hexiang Hu, Ibrahim Alabdulmohsin, Jialin Wu, Josip Djolonga, Julien Amelot, Kenton Lee, Keran Rong, Lucas Beyer, Mandar Joshi, Mario Lucic, Marvin Ritter, Matthias Minderer, Michael Tschannen, Mojtaba Seyedhosseini, Mostafa Dehghani, Neil Houlsby, Paulina Pietrzyk, Piotr Padlewski, Radu Soricut, Sebastian Goodman, Siamak Shakeri, Soravit Changpinyo, Xiaohua Zhai, Xiao Wang, Xi Chen, Yang Li, Yi Tay, Yuanzhong Xu","cross_cats":["cs.CL","cs.LG"],"headline":"Scaling up PaLI-X sets new state-of-the-art on most vision and language benchmarks and shows emergent capabilities.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2023-05-29T18:58:38Z","title":"PaLI-X: On Scaling up a Multilingual Vision and Language Model"},"references":{"count":99,"internal_anchors":4,"resolved_work":99,"sample":[{"cited_arxiv_id":"2204.02311","doi":"","is_internal_anchor":true,"ref_index":1,"title":"PaLM: Scaling Language Modeling with Pathways","work_id":"a94f3ef7-2c49-4445-93fe-6ec16aafd966","year":2022},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Tom B. Brown, Benjamin Mann, Nick Ryder, Jared Kaplan Melanie Subbiah, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, Sandhini Agarwal, Ariel Herbert-V oss, Gretche","work_id":"d5c09f2f-dcd2-437f-b8b9-cff70228c5b0","year":2020},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"GLaM: Efficient scaling of language models with mixture-of-experts","work_id":"cc30ef45-7346-488a-9685-f77b6fa80ccc","year":2022},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Dai, Orhan Firat, Melvin Johnson, Dmitry Lepikhin, Alexandre Passos, Siamak Shakeri, Emanuel Taropa, Paige Bailey, Zhifeng Chen, Eric Chu, Jonathan H","work_id":"2afad9fb-23ec-490f-bd19-fc208a680557","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"PaLI: A jointly-scaled multilingual language-image model","work_id":"8a12e27d-4f19-46bb-babd-b05a4c9e0fc8","year":2023}],"snapshot_sha256":"7f3c3e3bca5182bf876fd56ada3cb7974d5f89fa5a79eed1dfd9b8021c625fb3"},"source":{"id":"2305.18565","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-17T14:29:04.937460Z","id":"264574c8-6565-47c7-8d01-927db57ff00a","model_set":{"reader":"grok-4.3"},"one_line_summary":"Scaling a multilingual vision-language model in size and training breadth yields new state-of-the-art results on over 25 benchmarks plus emerging abilities in counting and multilingual detection.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Scaling up PaLI-X sets new state-of-the-art on most vision and language benchmarks and shows emergent capabilities.","strongest_claim":"PaLI-X advances the state-of-the-art on most vision-and-language benchmarks considered (25+ of them) and exhibits emerging capabilities such as complex counting and multilingual object detection.","weakest_assumption":"That increasing model size and broadening the training task mixture will reliably produce both higher benchmark scores and the observed emergent behaviors without requiring task-specific fine-tuning or additional architectural changes."}},"verdict_id":"264574c8-6565-47c7-8d01-927db57ff00a"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:598ed55d65c0903f26b156b9da8ac92ac67b9082d06e37b6619bce1bc0949fc5","target":"record","created_at":"2026-05-17T23:38:13Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"94e18e05584083e5b1cbbe254aba4a9c4cbb4c6b8dc63d01ec220ef24117dc7a","cross_cats_sorted":["cs.CL","cs.LG"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2023-05-29T18:58:38Z","title_canon_sha256":"3c7150397808e13ff72bab5cc440587de27d0854e43b1fc23a199ea026595a24"},"schema_version":"1.0","source":{"id":"2305.18565","kind":"arxiv","version":1}},"canonical_sha256":"b726f5a3bb5d507c67d4154315ef9cdc823a705c1f6de9de4a167c79eed8008a","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"b726f5a3bb5d507c67d4154315ef9cdc823a705c1f6de9de4a167c79eed8008a","first_computed_at":"2026-05-17T23:38:13.814348Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:13.814348Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"yHvzZQO8iLx5fyEV/8vyH13T4gwrKkV2Mm9wudT5kVc/svW27RbF5XJ+OfPzxUkeYle0vawi65nhE+UWYmQ2Bw==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:13.815083Z","signed_message":"canonical_sha256_bytes"},"source_id":"2305.18565","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:598ed55d65c0903f26b156b9da8ac92ac67b9082d06e37b6619bce1bc0949fc5","sha256:e88a1d6ed96745e36afc9feb96476163df19dba9e233ccd7fae86eb9195474bb"],"state_sha256":"6e56feb9ad3c2b41bae8742ba5b5727b2ff61847cf06967bf704c72d828edef2"}