{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:RNLXT3JAGTMBVMZZMRC3AOGBQP","short_pith_number":"pith:RNLXT3JA","canonical_record":{"source":{"id":"2605.14028","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-13T18:38:51Z","cross_cats_sorted":[],"title_canon_sha256":"a07c99d9289337019c324237fce5d3d64bca7473b659b40301115ca0128b9263","abstract_canon_sha256":"603b2c502b29c231ff054044abb6165cf7addcfe829fdc9742523fdcb110dc9e"},"schema_version":"1.0"},"canonical_sha256":"8b5779ed2034d81ab3396445b038c183e8e66da75bafbd54f17df70b94c91f3a","source":{"kind":"arxiv","id":"2605.14028","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.14028","created_at":"2026-05-17T23:39:12Z"},{"alias_kind":"arxiv_version","alias_value":"2605.14028v1","created_at":"2026-05-17T23:39:12Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.14028","created_at":"2026-05-17T23:39:12Z"},{"alias_kind":"pith_short_12","alias_value":"RNLXT3JAGTMB","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"RNLXT3JAGTMBVMZZ","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"RNLXT3JA","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:RNLXT3JAGTMBVMZZMRC3AOGBQP","target":"record","payload":{"canonical_record":{"source":{"id":"2605.14028","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-13T18:38:51Z","cross_cats_sorted":[],"title_canon_sha256":"a07c99d9289337019c324237fce5d3d64bca7473b659b40301115ca0128b9263","abstract_canon_sha256":"603b2c502b29c231ff054044abb6165cf7addcfe829fdc9742523fdcb110dc9e"},"schema_version":"1.0"},"canonical_sha256":"8b5779ed2034d81ab3396445b038c183e8e66da75bafbd54f17df70b94c91f3a","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:39:12.875869Z","signature_b64":"0zL5BpH2k9sETbacpE/Ynp4BZebYr15qZNhuVSfOcmV3hT/9K7Uv8miyc6a7hBq0C81RD5CT6aM/nmcqf/f7BA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"8b5779ed2034d81ab3396445b038c183e8e66da75bafbd54f17df70b94c91f3a","last_reissued_at":"2026-05-17T23:39:12.875309Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:39:12.875309Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2605.14028","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:39:12Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"8n4e0BTAxKTfsH8CXA6KUzgCySmn9H5NbfuywPOlw9EesknW/6E0hF7tmXZNwbJg/euH2OJCf7TNvxqM4HiMCA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-26T09:14:39.521672Z"},"content_sha256":"18f2f7708f6715ca8a5a5b2c253ae615051efd7eaa0c1f0e8b41d9f53afb0dbf","schema_version":"1.0","event_id":"sha256:18f2f7708f6715ca8a5a5b2c253ae615051efd7eaa0c1f0e8b41d9f53afb0dbf"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:RNLXT3JAGTMBVMZZMRC3AOGBQP","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Unified Pix Token And Word Token Generative Language Model","license":"http://creativecommons.org/licenses/by/4.0/","headline":"A new generative language model assigns each image pixel its own token to unify visual and textual inputs.","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Haun Leung, Zinan Wang","submitted_at":"2026-05-13T18:38:51Z","abstract_excerpt":"Since the emergence of Vision Transformer (ViT), it has been widely used in generative language model and generative visual model. Especially in the current state-of-art open source multimodal models, ViT obtained by CLIP or SigLIP method serves as the vision encoder backbone to help them acquire visual understanding capabilities. But this method leads to limitations in visual understanding for details, such as difficulty in recognizing small text or numbers in images. To address these issues, we propose a new model to unify pix token and word token into the generative language model. The new "},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"The new model unifies pix token and word token into the generative language model... The experimental results show that it has good performance even in small model and with limited training data.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That assigning each pixel its own token embedding plus the added color folding and global conditional attention approximation will produce meaningfully better visual detail understanding than existing patch-based encoders, without any quantitative comparison or ablation shown in the provided text.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"A new model unifies per-pixel and word tokens in a generative language model with per-pixel embeddings, color folding, and unsupervised image pretraining, reporting good performance on small models with limited data.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"A new generative language model assigns each image pixel its own token to unify visual and textual inputs.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"9a1aa6f810241c07a415db3881ef09ee64679dd0c66a5d54bc9a9eefdefceb3e"},"source":{"id":"2605.14028","kind":"arxiv","version":1},"verdict":{"id":"c2a94eda-aad4-4f54-9bc9-017191295a9d","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T05:45:42.466348Z","strongest_claim":"The new model unifies pix token and word token into the generative language model... The experimental results show that it has good performance even in small model and with limited training data.","one_line_summary":"A new model unifies per-pixel and word tokens in a generative language model with per-pixel embeddings, color folding, and unsupervised image pretraining, reporting good performance on small models with limited data.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That assigning each pixel its own token embedding plus the added color folding and global conditional attention approximation will produce meaningfully better visual detail understanding than existing patch-based encoders, without any quantitative comparison or ablation shown in the provided text.","pith_extraction_headline":"A new generative language model assigns each image pixel its own token to unify visual and textual inputs."},"references":{"count":15,"sample":[{"doi":"","year":null,"title":"Neural Information Processing Systems , year=","work_id":"ddefce0d-5a65-4f2a-acdb-098979bde8a5","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2020,"title":"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale , author=. 2020 , journal=","work_id":"46d0f5e4-5977-4acc-a709-5a5872ad371a","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2021,"title":"Learning Transferable Visual Models From Natural Language Supervision , author=. 2021 , eprint=","work_id":"52e6dbfc-97f5-4533-84f4-5802ac2d7e6d","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"Sigmoid Loss for Language Image Pre-Training , author=. 2023 , eprint=","work_id":"82e7cb09-ab86-418a-9e80-106efecde008","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"Visual Instruction Tuning , author=. 2023 , eprint=","work_id":"81b50bd6-b5a3-4943-9f30-19ea5bdfd626","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":15,"snapshot_sha256":"f34e5b26b52e212d8954048d0ed037a2af4e6b66f3538d43844b37549647d733","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"c2a94eda-aad4-4f54-9bc9-017191295a9d"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:39:12Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"J5gz2OnycDBxlDRDQUGjLFCh4Pd7czFA0wyL+OAuCHDFO205pF3tOrxruqfzCprBgWgaCRfRK8aMG9SFkujRDg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-26T09:14:39.522728Z"},"content_sha256":"ff21a8eb034800e50a4c3be017bb90298a2ba8eb911e946e18612f98ae6d46fe","schema_version":"1.0","event_id":"sha256:ff21a8eb034800e50a4c3be017bb90298a2ba8eb911e946e18612f98ae6d46fe"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/RNLXT3JAGTMBVMZZMRC3AOGBQP/bundle.json","state_url":"https://pith.science/pith/RNLXT3JAGTMBVMZZMRC3AOGBQP/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/RNLXT3JAGTMBVMZZMRC3AOGBQP/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-26T09:14:39Z","links":{"resolver":"https://pith.science/pith/RNLXT3JAGTMBVMZZMRC3AOGBQP","bundle":"https://pith.science/pith/RNLXT3JAGTMBVMZZMRC3AOGBQP/bundle.json","state":"https://pith.science/pith/RNLXT3JAGTMBVMZZMRC3AOGBQP/state.json","well_known_bundle":"https://pith.science/.well-known/pith/RNLXT3JAGTMBVMZZMRC3AOGBQP/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:RNLXT3JAGTMBVMZZMRC3AOGBQP","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"603b2c502b29c231ff054044abb6165cf7addcfe829fdc9742523fdcb110dc9e","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-13T18:38:51Z","title_canon_sha256":"a07c99d9289337019c324237fce5d3d64bca7473b659b40301115ca0128b9263"},"schema_version":"1.0","source":{"id":"2605.14028","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.14028","created_at":"2026-05-17T23:39:12Z"},{"alias_kind":"arxiv_version","alias_value":"2605.14028v1","created_at":"2026-05-17T23:39:12Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.14028","created_at":"2026-05-17T23:39:12Z"},{"alias_kind":"pith_short_12","alias_value":"RNLXT3JAGTMB","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"RNLXT3JAGTMBVMZZ","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"RNLXT3JA","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:ff21a8eb034800e50a4c3be017bb90298a2ba8eb911e946e18612f98ae6d46fe","target":"graph","created_at":"2026-05-17T23:39:12Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"The new model unifies pix token and word token into the generative language model... The experimental results show that it has good performance even in small model and with limited training data."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That assigning each pixel its own token embedding plus the added color folding and global conditional attention approximation will produce meaningfully better visual detail understanding than existing patch-based encoders, without any quantitative comparison or ablation shown in the provided text."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"A new model unifies per-pixel and word tokens in a generative language model with per-pixel embeddings, color folding, and unsupervised image pretraining, reporting good performance on small models with limited data."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"A new generative language model assigns each image pixel its own token to unify visual and textual inputs."}],"snapshot_sha256":"9a1aa6f810241c07a415db3881ef09ee64679dd0c66a5d54bc9a9eefdefceb3e"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"Since the emergence of Vision Transformer (ViT), it has been widely used in generative language model and generative visual model. Especially in the current state-of-art open source multimodal models, ViT obtained by CLIP or SigLIP method serves as the vision encoder backbone to help them acquire visual understanding capabilities. But this method leads to limitations in visual understanding for details, such as difficulty in recognizing small text or numbers in images. To address these issues, we propose a new model to unify pix token and word token into the generative language model. The new ","authors_text":"Haun Leung, Zinan Wang","cross_cats":[],"headline":"A new generative language model assigns each image pixel its own token to unify visual and textual inputs.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-13T18:38:51Z","title":"Unified Pix Token And Word Token Generative Language Model"},"references":{"count":15,"internal_anchors":0,"resolved_work":15,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Neural Information Processing Systems , year=","work_id":"ddefce0d-5a65-4f2a-acdb-098979bde8a5","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale , author=. 2020 , journal=","work_id":"46d0f5e4-5977-4acc-a709-5a5872ad371a","year":2020},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Learning Transferable Visual Models From Natural Language Supervision , author=. 2021 , eprint=","work_id":"52e6dbfc-97f5-4533-84f4-5802ac2d7e6d","year":2021},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Sigmoid Loss for Language Image Pre-Training , author=. 2023 , eprint=","work_id":"82e7cb09-ab86-418a-9e80-106efecde008","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Visual Instruction Tuning , author=. 2023 , eprint=","work_id":"81b50bd6-b5a3-4943-9f30-19ea5bdfd626","year":2023}],"snapshot_sha256":"f34e5b26b52e212d8954048d0ed037a2af4e6b66f3538d43844b37549647d733"},"source":{"id":"2605.14028","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-15T05:45:42.466348Z","id":"c2a94eda-aad4-4f54-9bc9-017191295a9d","model_set":{"reader":"grok-4.3"},"one_line_summary":"A new model unifies per-pixel and word tokens in a generative language model with per-pixel embeddings, color folding, and unsupervised image pretraining, reporting good performance on small models with limited data.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"A new generative language model assigns each image pixel its own token to unify visual and textual inputs.","strongest_claim":"The new model unifies pix token and word token into the generative language model... The experimental results show that it has good performance even in small model and with limited training data.","weakest_assumption":"That assigning each pixel its own token embedding plus the added color folding and global conditional attention approximation will produce meaningfully better visual detail understanding than existing patch-based encoders, without any quantitative comparison or ablation shown in the provided text."}},"verdict_id":"c2a94eda-aad4-4f54-9bc9-017191295a9d"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:18f2f7708f6715ca8a5a5b2c253ae615051efd7eaa0c1f0e8b41d9f53afb0dbf","target":"record","created_at":"2026-05-17T23:39:12Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"603b2c502b29c231ff054044abb6165cf7addcfe829fdc9742523fdcb110dc9e","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-13T18:38:51Z","title_canon_sha256":"a07c99d9289337019c324237fce5d3d64bca7473b659b40301115ca0128b9263"},"schema_version":"1.0","source":{"id":"2605.14028","kind":"arxiv","version":1}},"canonical_sha256":"8b5779ed2034d81ab3396445b038c183e8e66da75bafbd54f17df70b94c91f3a","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"8b5779ed2034d81ab3396445b038c183e8e66da75bafbd54f17df70b94c91f3a","first_computed_at":"2026-05-17T23:39:12.875309Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:39:12.875309Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"0zL5BpH2k9sETbacpE/Ynp4BZebYr15qZNhuVSfOcmV3hT/9K7Uv8miyc6a7hBq0C81RD5CT6aM/nmcqf/f7BA==","signature_status":"signed_v1","signed_at":"2026-05-17T23:39:12.875869Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.14028","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:18f2f7708f6715ca8a5a5b2c253ae615051efd7eaa0c1f0e8b41d9f53afb0dbf","sha256:ff21a8eb034800e50a4c3be017bb90298a2ba8eb911e946e18612f98ae6d46fe"],"state_sha256":"2a1f997045b6f4e5264a478410b3367d32bf3c7107432861f0fc26284aaf9592"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"nJKx2GPAm1mU1Hx0dv4r2Q5L1gJZXOEWOAdDVWSbkMQHGpccrpuxKFfZrOMkCsBR/bPZ1Q3GjaUVq6OF8VLrBg==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-26T09:14:39.527474Z","bundle_sha256":"a332f92c03dd7e36b4f9559f2c7892f3b451d5ac60a9decacc0e69fb8c190704"}}