{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2025:VITK3VBU5MEPLZ4MOALEG5VYZ6","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"d5c091666b3b7ee6e36740e3f46ddaba83dc06f3c49fa818841cd1f10fe06639","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2025-03-13T17:56:05Z","title_canon_sha256":"a834e22b8a0f74a52fd27e91bfd289fbbba20a0f8d0cae4148e740dde2f22644"},"schema_version":"1.0","source":{"id":"2503.10615","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2503.10615","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"arxiv_version","alias_value":"2503.10615v2","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2503.10615","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"pith_short_12","alias_value":"VITK3VBU5MEP","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"VITK3VBU5MEPLZ4M","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"VITK3VBU","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:d20a927710aec6c01433bb5d2be00913b2590c0bdc63314ad35a9fb1d61b8a3b","target":"graph","created_at":"2026-05-17T23:38:49Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Experimental results show that R1-Onevision achieves state-of-the-art performance, outperforming models such as GPT-4o and Qwen2.5-VL on multiple challenging multimodal reasoning benchmarks."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"The cross-modal reasoning pipeline that transforms images into formal textual representations enables precise language-based reasoning without loss of critical visual information."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"R1-Onevision turns images into structured text for multimodal reasoning, trains on a custom dataset with RL, and claims SOTA results on an educational benchmark."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Converting images to formal textual representations lets a new model reason more precisely about visual content and outperform GPT-4o on multimodal benchmarks."}],"snapshot_sha256":"328bf0673f857cc04372ad3144a7057d20744b2d37143ae14c47bd62ad9a0d3d"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"da588d0e4ac644c4f37b168acb8e544ce1d95322a0b2f1d6712432f995874f73"},"paper":{"abstract_excerpt":"Large Language Models have demonstrated remarkable reasoning capability in complex textual tasks. However, multimodal reasoning, which requires integrating visual and textual information, remains a significant challenge. Existing visual-language models often struggle to effectively analyze and reason visual content, resulting in suboptimal performance on complex reasoning tasks. Moreover, the absence of comprehensive benchmarks hinders the accurate assessment of multimodal reasoning capabilities. In this paper, we introduce R1-Onevision, a multimodal reasoning model designed to bridge the gap ","authors_text":"Bo Zhang, Dacheng Yin, Fengyun Rao, Haoyu Lu, Hongkun Pan, Minfeng Zhu, Wei Chen, Xiaoxuan He, Xingtao Yang, Xiyan Jiang, Yan Deng, Yi Yang","cross_cats":[],"headline":"Converting images to formal textual representations lets a new model reason more precisely about visual content and outperform GPT-4o on multimodal benchmarks.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2025-03-13T17:56:05Z","title":"R1-Onevision: Advancing Generalized Multimodal Reasoning through Cross-Modal Formalization"},"references":{"count":52,"internal_anchors":12,"resolved_work":52,"sample":[{"cited_arxiv_id":"2303.08774","doi":"","is_internal_anchor":true,"ref_index":1,"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Large language models for mathematical reasoning: Progresses and challenges","work_id":"d8278850-469f-40f1-8af5-cf634483852f","year":2024},{"cited_arxiv_id":"2502.13923","doi":"","is_internal_anchor":true,"ref_index":3,"title":"Qwen2.5-VL Technical Report","work_id":"69dffacb-bfe8-442d-be86-48624c60426f","year":2025},{"cited_arxiv_id":"2107.03374","doi":"","is_internal_anchor":true,"ref_index":4,"title":"Evaluating Large Language Models Trained on Code","work_id":"042493e9-b26f-4b4e-bbde-382072ca9b08","year":2021},{"cited_arxiv_id":"2412.05271","doi":"","is_internal_anchor":true,"ref_index":5,"title":"Expanding Performance Boundaries of Open-Source Multimodal Models with Model, Data, and Test-Time Scaling","work_id":"ee70bdc8-4656-4849-ada7-ce42a2278d70","year":2024}],"snapshot_sha256":"37c05a6301528103c18f297a800da92de427d48f1dfb8a2c911a07254025246c"},"source":{"id":"2503.10615","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-16T00:14:57.086487Z","id":"5b1d824b-5318-486b-bc22-b1dab766e6b3","model_set":{"reader":"grok-4.3"},"one_line_summary":"R1-Onevision turns images into structured text for multimodal reasoning, trains on a custom dataset with RL, and claims SOTA results on an educational benchmark.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Converting images to formal textual representations lets a new model reason more precisely about visual content and outperform GPT-4o on multimodal benchmarks.","strongest_claim":"Experimental results show that R1-Onevision achieves state-of-the-art performance, outperforming models such as GPT-4o and Qwen2.5-VL on multiple challenging multimodal reasoning benchmarks.","weakest_assumption":"The cross-modal reasoning pipeline that transforms images into formal textual representations enables precise language-based reasoning without loss of critical visual information."}},"verdict_id":"5b1d824b-5318-486b-bc22-b1dab766e6b3"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:342016d6a9914ed32250ca87da6216c5c90b9fe063adfd298735176f629fd9fd","target":"record","created_at":"2026-05-17T23:38:49Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"d5c091666b3b7ee6e36740e3f46ddaba83dc06f3c49fa818841cd1f10fe06639","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2025-03-13T17:56:05Z","title_canon_sha256":"a834e22b8a0f74a52fd27e91bfd289fbbba20a0f8d0cae4148e740dde2f22644"},"schema_version":"1.0","source":{"id":"2503.10615","kind":"arxiv","version":2}},"canonical_sha256":"aa26add434eb08f5e78c70164376b8cfb7ad667f77886f2511bf8d2db77f60c0","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"aa26add434eb08f5e78c70164376b8cfb7ad667f77886f2511bf8d2db77f60c0","first_computed_at":"2026-05-17T23:38:49.635056Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:49.635056Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"72qviIJB96ShGM5ItxbBRdgsMBvj7Rc9qoTNyRg9hHxCkteqzHYXnIxqDDcPSyIa1SrQx6Ms2/EaMFRpry0IDQ==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:49.635538Z","signed_message":"canonical_sha256_bytes"},"source_id":"2503.10615","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:342016d6a9914ed32250ca87da6216c5c90b9fe063adfd298735176f629fd9fd","sha256:d20a927710aec6c01433bb5d2be00913b2590c0bdc63314ad35a9fb1d61b8a3b"],"state_sha256":"233118b97aeec32ef5649859723fb2583972e506cf9a7b375f048f9acc8046e0"}