{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:WP3BBLDLKSDFOXPGWIHW7UNOJR","short_pith_number":"pith:WP3BBLDL","canonical_record":{"source":{"id":"2602.22918","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2026-02-26T12:06:02Z","cross_cats_sorted":[],"title_canon_sha256":"0e39d9d87f55beb0bb2e47a6db5c08075fbd593200c20273a65c7ec7d32c6516","abstract_canon_sha256":"a7dc2563348038b7712030ef79d94e915b77d304f57d01eb0c8c5a525ff2c84c"},"schema_version":"1.0"},"canonical_sha256":"b3f610ac6b5486575de6b20f6fd1ae4c4e9565b6e22d4dd46c45961fd0fddbd4","source":{"kind":"arxiv","id":"2602.22918","version":3},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2602.22918","created_at":"2026-05-20T00:00:35Z"},{"alias_kind":"arxiv_version","alias_value":"2602.22918v3","created_at":"2026-05-20T00:00:35Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2602.22918","created_at":"2026-05-20T00:00:35Z"},{"alias_kind":"pith_short_12","alias_value":"WP3BBLDLKSDF","created_at":"2026-05-20T00:00:35Z"},{"alias_kind":"pith_short_16","alias_value":"WP3BBLDLKSDFOXPG","created_at":"2026-05-20T00:00:35Z"},{"alias_kind":"pith_short_8","alias_value":"WP3BBLDL","created_at":"2026-05-20T00:00:35Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:WP3BBLDLKSDFOXPGWIHW7UNOJR","target":"record","payload":{"canonical_record":{"source":{"id":"2602.22918","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2026-02-26T12:06:02Z","cross_cats_sorted":[],"title_canon_sha256":"0e39d9d87f55beb0bb2e47a6db5c08075fbd593200c20273a65c7ec7d32c6516","abstract_canon_sha256":"a7dc2563348038b7712030ef79d94e915b77d304f57d01eb0c8c5a525ff2c84c"},"schema_version":"1.0"},"canonical_sha256":"b3f610ac6b5486575de6b20f6fd1ae4c4e9565b6e22d4dd46c45961fd0fddbd4","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T00:00:35.209616Z","signature_b64":"bA4XOgKNqcG0B5yRE8h5Tzr2VQvJRmoXIWE8nBEDMKjfktxtwsFEQR1SoUrYC1qM6GMNCyhScSxh/9ZK2+GRDw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"b3f610ac6b5486575de6b20f6fd1ae4c4e9565b6e22d4dd46c45961fd0fddbd4","last_reissued_at":"2026-05-20T00:00:35.208847Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T00:00:35.208847Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2602.22918","source_version":3,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-20T00:00:35Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"PbtrYo7Oy7v/8qETrrFHr49wcr7qzPfZTWhpK0HmpL3TuyyHLMZi8BGfhlQqdFMFAlL/p6gS3CoudT+KcPlNBg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-04T01:25:24.539947Z"},"content_sha256":"49fb3b7cd2a91eaf380797c346df46d6276cf60f99e3d1ef8cc2d56fe47909ed","schema_version":"1.0","event_id":"sha256:49fb3b7cd2a91eaf380797c346df46d6276cf60f99e3d1ef8cc2d56fe47909ed"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:WP3BBLDLKSDFOXPGWIHW7UNOJR","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Where Vision Becomes Text: Locating the OCR Routing Bottleneck in Vision-Language Models","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"OCR information routes into vision-language models at architecture-specific layers, forming a low-dimensional signal that transfers across datasets.","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Jonathan Steinberg, Oren Gal","submitted_at":"2026-02-26T12:06:02Z","abstract_excerpt":"Vision-language models (VLMs) can read text from images, but where does this optical character recognition (OCR) information enter the language processing stream? We investigate the OCR routing mechanism across three architecture families (Qwen3-VL, Phi-4, InternVL3.5) using causal interventions. By computing activation differences between original images and text-inpainted versions, we identify architecture-specific OCR bottlenecks whose dominant location depends on the vision-language integration strategy: DeepStack models (Qwen) show peak sensitivity at mid-depth (about 50%) for scene text,"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"The OCR signal is remarkably low-dimensional: PC1 captures 72.9% of variance. Crucially, principal component analysis (PCA) directions learned on one dataset transfer to others, demonstrating shared text-processing pathways. Surprisingly, in models with modular OCR circuits (notably Qwen3-VL-4B), OCR removal can improve counting performance (up to +6.9 percentage points).","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That inpainting text in images only affects the OCR pathway without introducing other unintended changes to the visual input that could confound the activation differences.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Causal interventions identify architecture-specific OCR bottlenecks in VLMs at mid or early layers, with low-dimensional shared pathways and potential performance benefits from OCR removal.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"OCR information routes into vision-language models at architecture-specific layers, forming a low-dimensional signal that transfers across datasets.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"be509f75d441e45c7701a833f09421ce287bd43eb7ca9c2305df735ace667953"},"source":{"id":"2602.22918","kind":"arxiv","version":3},"verdict":{"id":"72671c9f-5b31-471a-9f3a-031a086bf24c","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T18:48:50.193091Z","strongest_claim":"The OCR signal is remarkably low-dimensional: PC1 captures 72.9% of variance. Crucially, principal component analysis (PCA) directions learned on one dataset transfer to others, demonstrating shared text-processing pathways. Surprisingly, in models with modular OCR circuits (notably Qwen3-VL-4B), OCR removal can improve counting performance (up to +6.9 percentage points).","one_line_summary":"Causal interventions identify architecture-specific OCR bottlenecks in VLMs at mid or early layers, with low-dimensional shared pathways and potential performance benefits from OCR removal.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That inpainting text in images only affects the OCR pathway without introducing other unintended changes to the visual input that could confound the activation differences.","pith_extraction_headline":"OCR information routes into vision-language models at architecture-specific layers, forming a low-dimensional signal that transfers across datasets."},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2602.22918/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":2,"snapshot_sha256":"b59abfa51f9506f64efc133bac7adf3dee799cbd2d812106f10886df9bf3b4c6"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"72671c9f-5b31-471a-9f3a-031a086bf24c"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-20T00:00:35Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"LDf6y585TE2zaKktvbaqnZuFmj6X5xyaThZKVSvrP1UNuIy+O1xfjJojUqZUWdwrfT21rSZjL1UeIlfhs74RCw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-04T01:25:24.540446Z"},"content_sha256":"4f478e7682e281d72fce7ac875dacf2052ca7a4df64d7d4d4f90baa3c19c74ca","schema_version":"1.0","event_id":"sha256:4f478e7682e281d72fce7ac875dacf2052ca7a4df64d7d4d4f90baa3c19c74ca"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/WP3BBLDLKSDFOXPGWIHW7UNOJR/bundle.json","state_url":"https://pith.science/pith/WP3BBLDLKSDFOXPGWIHW7UNOJR/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/WP3BBLDLKSDFOXPGWIHW7UNOJR/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-04T01:25:24Z","links":{"resolver":"https://pith.science/pith/WP3BBLDLKSDFOXPGWIHW7UNOJR","bundle":"https://pith.science/pith/WP3BBLDLKSDFOXPGWIHW7UNOJR/bundle.json","state":"https://pith.science/pith/WP3BBLDLKSDFOXPGWIHW7UNOJR/state.json","well_known_bundle":"https://pith.science/.well-known/pith/WP3BBLDLKSDFOXPGWIHW7UNOJR/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:WP3BBLDLKSDFOXPGWIHW7UNOJR","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"a7dc2563348038b7712030ef79d94e915b77d304f57d01eb0c8c5a525ff2c84c","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2026-02-26T12:06:02Z","title_canon_sha256":"0e39d9d87f55beb0bb2e47a6db5c08075fbd593200c20273a65c7ec7d32c6516"},"schema_version":"1.0","source":{"id":"2602.22918","kind":"arxiv","version":3}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2602.22918","created_at":"2026-05-20T00:00:35Z"},{"alias_kind":"arxiv_version","alias_value":"2602.22918v3","created_at":"2026-05-20T00:00:35Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2602.22918","created_at":"2026-05-20T00:00:35Z"},{"alias_kind":"pith_short_12","alias_value":"WP3BBLDLKSDF","created_at":"2026-05-20T00:00:35Z"},{"alias_kind":"pith_short_16","alias_value":"WP3BBLDLKSDFOXPG","created_at":"2026-05-20T00:00:35Z"},{"alias_kind":"pith_short_8","alias_value":"WP3BBLDL","created_at":"2026-05-20T00:00:35Z"}],"graph_snapshots":[{"event_id":"sha256:4f478e7682e281d72fce7ac875dacf2052ca7a4df64d7d4d4f90baa3c19c74ca","target":"graph","created_at":"2026-05-20T00:00:35Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"The OCR signal is remarkably low-dimensional: PC1 captures 72.9% of variance. Crucially, principal component analysis (PCA) directions learned on one dataset transfer to others, demonstrating shared text-processing pathways. Surprisingly, in models with modular OCR circuits (notably Qwen3-VL-4B), OCR removal can improve counting performance (up to +6.9 percentage points)."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That inpainting text in images only affects the OCR pathway without introducing other unintended changes to the visual input that could confound the activation differences."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Causal interventions identify architecture-specific OCR bottlenecks in VLMs at mid or early layers, with low-dimensional shared pathways and potential performance benefits from OCR removal."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"OCR information routes into vision-language models at architecture-specific layers, forming a low-dimensional signal that transfers across datasets."}],"snapshot_sha256":"be509f75d441e45c7701a833f09421ce287bd43eb7ca9c2305df735ace667953"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"b59abfa51f9506f64efc133bac7adf3dee799cbd2d812106f10886df9bf3b4c6"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2602.22918/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Vision-language models (VLMs) can read text from images, but where does this optical character recognition (OCR) information enter the language processing stream? We investigate the OCR routing mechanism across three architecture families (Qwen3-VL, Phi-4, InternVL3.5) using causal interventions. By computing activation differences between original images and text-inpainted versions, we identify architecture-specific OCR bottlenecks whose dominant location depends on the vision-language integration strategy: DeepStack models (Qwen) show peak sensitivity at mid-depth (about 50%) for scene text,","authors_text":"Jonathan Steinberg, Oren Gal","cross_cats":[],"headline":"OCR information routes into vision-language models at architecture-specific layers, forming a low-dimensional signal that transfers across datasets.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2026-02-26T12:06:02Z","title":"Where Vision Becomes Text: Locating the OCR Routing Bottleneck in Vision-Language Models"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2602.22918","kind":"arxiv","version":3},"verdict":{"created_at":"2026-05-15T18:48:50.193091Z","id":"72671c9f-5b31-471a-9f3a-031a086bf24c","model_set":{"reader":"grok-4.3"},"one_line_summary":"Causal interventions identify architecture-specific OCR bottlenecks in VLMs at mid or early layers, with low-dimensional shared pathways and potential performance benefits from OCR removal.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"OCR information routes into vision-language models at architecture-specific layers, forming a low-dimensional signal that transfers across datasets.","strongest_claim":"The OCR signal is remarkably low-dimensional: PC1 captures 72.9% of variance. Crucially, principal component analysis (PCA) directions learned on one dataset transfer to others, demonstrating shared text-processing pathways. Surprisingly, in models with modular OCR circuits (notably Qwen3-VL-4B), OCR removal can improve counting performance (up to +6.9 percentage points).","weakest_assumption":"That inpainting text in images only affects the OCR pathway without introducing other unintended changes to the visual input that could confound the activation differences."}},"verdict_id":"72671c9f-5b31-471a-9f3a-031a086bf24c"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:49fb3b7cd2a91eaf380797c346df46d6276cf60f99e3d1ef8cc2d56fe47909ed","target":"record","created_at":"2026-05-20T00:00:35Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"a7dc2563348038b7712030ef79d94e915b77d304f57d01eb0c8c5a525ff2c84c","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2026-02-26T12:06:02Z","title_canon_sha256":"0e39d9d87f55beb0bb2e47a6db5c08075fbd593200c20273a65c7ec7d32c6516"},"schema_version":"1.0","source":{"id":"2602.22918","kind":"arxiv","version":3}},"canonical_sha256":"b3f610ac6b5486575de6b20f6fd1ae4c4e9565b6e22d4dd46c45961fd0fddbd4","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"b3f610ac6b5486575de6b20f6fd1ae4c4e9565b6e22d4dd46c45961fd0fddbd4","first_computed_at":"2026-05-20T00:00:35.208847Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-20T00:00:35.208847Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"bA4XOgKNqcG0B5yRE8h5Tzr2VQvJRmoXIWE8nBEDMKjfktxtwsFEQR1SoUrYC1qM6GMNCyhScSxh/9ZK2+GRDw==","signature_status":"signed_v1","signed_at":"2026-05-20T00:00:35.209616Z","signed_message":"canonical_sha256_bytes"},"source_id":"2602.22918","source_kind":"arxiv","source_version":3}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:49fb3b7cd2a91eaf380797c346df46d6276cf60f99e3d1ef8cc2d56fe47909ed","sha256:4f478e7682e281d72fce7ac875dacf2052ca7a4df64d7d4d4f90baa3c19c74ca"],"state_sha256":"98285dfeb2c5298087e1fbe8fe3fe1ada5762bc0d9c1d227feeee4d3efcba949"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"2hjoxUzO8FY+HHfELGJlhi8NpnMabCwe2I6YpObzouln1KY5pKOLr1I6350cY9xsFCfgGy4l1pni8GZ3T96MCg==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-04T01:25:24.542767Z","bundle_sha256":"412e640dca6ba5b942eb13a1c947d83f39a1edaeb603a52a5d42622179580f37"}}