{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:VNAKPWUDD2NBXGSRU7DCSFJ5GV","short_pith_number":"pith:VNAKPWUD","schema_version":"1.0","canonical_sha256":"ab40a7da831e9a1b9a51a7c629153d355484953ba973aa4e421ce8144bf400e9","source":{"kind":"arxiv","id":"2505.21457","version":2},"attestation_state":"computed","paper":{"title":"ACTIVE-o3: Empowering MLLMs with Active Perception via Pure Reinforcement Learning","license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.CV","authors_text":"Anzhou Li, Canyu Zhao, Cheng Zou, Chunhua Shen, Hao Chen, Hao Zhong, Jingdong Chen, Ming Yang, Mingyu Liu, Muzhi Zhu, Zheng Huang, Zongze Du","submitted_at":"2025-05-27T17:29:31Z","abstract_excerpt":"Active vision, also known as active perception, refers to actively selecting where and how to look in order to gather task-relevant information. It is a critical component of efficient perception and decision-making in humans and advanced embodied agents. With the rise of Multimodal Large Language Models (MLLMs) as central planners in robotic systems, the lack of methods for equipping MLLMs with active perception has become a key gap. We first provide a systematic definition of MLLM-based active perception tasks and show that GPT-o3's zoom-in strategy can be viewed as a special case, though it"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2505.21457","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","primary_cat":"cs.CV","submitted_at":"2025-05-27T17:29:31Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"120b60722cd368165684ab7524158d64648b9a2bcefe8e30268bdc1a6f1e97f1","abstract_canon_sha256":"fc72a0d8408b9f347ab9ba3b34961216b8dcd4603c28bc1f84eb88abc095c489"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-09T02:08:30.841215Z","signature_b64":"laHUYYBXx8PHS+r3K9ia8nEG3weqZcFnED2iBXhtB6JE0uoOEkc/Lo3k08T5oH1aOjgWRtYYOghp3EmN2z75Bg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"ab40a7da831e9a1b9a51a7c629153d355484953ba973aa4e421ce8144bf400e9","last_reissued_at":"2026-06-09T02:08:30.840013Z","signature_status":"signed_v1","first_computed_at":"2026-06-09T02:08:30.840013Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"ACTIVE-o3: Empowering MLLMs with Active Perception via Pure Reinforcement Learning","license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.CV","authors_text":"Anzhou Li, Canyu Zhao, Cheng Zou, Chunhua Shen, Hao Chen, Hao Zhong, Jingdong Chen, Ming Yang, Mingyu Liu, Muzhi Zhu, Zheng Huang, Zongze Du","submitted_at":"2025-05-27T17:29:31Z","abstract_excerpt":"Active vision, also known as active perception, refers to actively selecting where and how to look in order to gather task-relevant information. It is a critical component of efficient perception and decision-making in humans and advanced embodied agents. With the rise of Multimodal Large Language Models (MLLMs) as central planners in robotic systems, the lack of methods for equipping MLLMs with active perception has become a key gap. We first provide a systematic definition of MLLM-based active perception tasks and show that GPT-o3's zoom-in strategy can be viewed as a special case, though it"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2505.21457","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2505.21457/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2505.21457","created_at":"2026-06-09T02:08:30.840204+00:00"},{"alias_kind":"arxiv_version","alias_value":"2505.21457v2","created_at":"2026-06-09T02:08:30.840204+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2505.21457","created_at":"2026-06-09T02:08:30.840204+00:00"},{"alias_kind":"pith_short_12","alias_value":"VNAKPWUDD2NB","created_at":"2026-06-09T02:08:30.840204+00:00"},{"alias_kind":"pith_short_16","alias_value":"VNAKPWUDD2NBXGSR","created_at":"2026-06-09T02:08:30.840204+00:00"},{"alias_kind":"pith_short_8","alias_value":"VNAKPWUD","created_at":"2026-06-09T02:08:30.840204+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":6,"internal_anchor_count":6,"sample":[{"citing_arxiv_id":"2605.19538","citing_title":"CaptchaMind: Training CAPTCHA Solvers via Reinforcement Learning with Explicit Reasoning Supervision","ref_index":6,"is_internal_anchor":true},{"citing_arxiv_id":"2605.15542","citing_title":"DRS-GUI: Dynamic Region Search for Training-Free GUI Grounding","ref_index":55,"is_internal_anchor":true},{"citing_arxiv_id":"2507.06448","citing_title":"Perception-Aware Policy Optimization for Multimodal Reasoning","ref_index":45,"is_internal_anchor":true},{"citing_arxiv_id":"2509.07969","citing_title":"Mini-o3: Scaling Up Reasoning Patterns and Interaction Turns for Visual Search","ref_index":51,"is_internal_anchor":true},{"citing_arxiv_id":"2511.19972","citing_title":"Boosting Reasoning in Large Multimodal Models via Activation Replay","ref_index":67,"is_internal_anchor":true},{"citing_arxiv_id":"2604.06777","citing_title":"Walk the Talk: Bridging the Reasoning-Action Gap for Thinking with Images via Multimodal Agentic Policy Optimization","ref_index":66,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/VNAKPWUDD2NBXGSRU7DCSFJ5GV","json":"https://pith.science/pith/VNAKPWUDD2NBXGSRU7DCSFJ5GV.json","graph_json":"https://pith.science/api/pith-number/VNAKPWUDD2NBXGSRU7DCSFJ5GV/graph.json","events_json":"https://pith.science/api/pith-number/VNAKPWUDD2NBXGSRU7DCSFJ5GV/events.json","paper":"https://pith.science/paper/VNAKPWUD"},"agent_actions":{"view_html":"https://pith.science/pith/VNAKPWUDD2NBXGSRU7DCSFJ5GV","download_json":"https://pith.science/pith/VNAKPWUDD2NBXGSRU7DCSFJ5GV.json","view_paper":"https://pith.science/paper/VNAKPWUD","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2505.21457&json=true","fetch_graph":"https://pith.science/api/pith-number/VNAKPWUDD2NBXGSRU7DCSFJ5GV/graph.json","fetch_events":"https://pith.science/api/pith-number/VNAKPWUDD2NBXGSRU7DCSFJ5GV/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/VNAKPWUDD2NBXGSRU7DCSFJ5GV/action/timestamp_anchor","attest_storage":"https://pith.science/pith/VNAKPWUDD2NBXGSRU7DCSFJ5GV/action/storage_attestation","attest_author":"https://pith.science/pith/VNAKPWUDD2NBXGSRU7DCSFJ5GV/action/author_attestation","sign_citation":"https://pith.science/pith/VNAKPWUDD2NBXGSRU7DCSFJ5GV/action/citation_signature","submit_replication":"https://pith.science/pith/VNAKPWUDD2NBXGSRU7DCSFJ5GV/action/replication_record"}},"created_at":"2026-06-09T02:08:30.840204+00:00","updated_at":"2026-06-09T02:08:30.840204+00:00"}