{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:Y2U5SNI2RKE3OMJHFTWEOVSPEC","short_pith_number":"pith:Y2U5SNI2","schema_version":"1.0","canonical_sha256":"c6a9d9351a8a89b731272cec47564f20a5cdfa4c16d7ac3d9d1e8d4cf65a09ad","source":{"kind":"arxiv","id":"2510.00054","version":2},"attestation_state":"computed","paper":{"title":"HiDe: Rethinking The Zoom-IN method in High Resolution MLLMs via Hierarchical Decoupling","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.CV","authors_text":"Bo Zheng, Jian Xu, Liang Wu, Xianjie Liu, Yiman Hu, Yixiong Zou","submitted_at":"2025-09-28T08:31:48Z","abstract_excerpt":"Multimodal Large Language Models (MLLMs) have made significant strides in visual understanding tasks. However, their performance on high-resolution images remains suboptimal. While existing approaches often attribute this limitation to perceptual constraints and argue that MLLMs struggle to recognize small objects, leading them to use \"zoom in\" strategies for better detail, our analysis reveals a different cause: the main issue is not object size, but rather caused by complex background interference. We systematically analyze this \"zoom in\" operation through a series of decoupling experiments "},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2510.00054","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2025-09-28T08:31:48Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"4ea2a29ca78cc03c419bd09616c31136bc8ddc62f0d04ce95f4a13460d80f062","abstract_canon_sha256":"aa081f654743edd5711f6340f8737d4f7237881250ce2432f98f088f30c5ff1c"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T02:05:36.715261Z","signature_b64":"/jCjF7ovmOS5xPwQY8+zUyupu6Rha/g06o+dW0x1FBdxbVjbQGuA1Bsgky/RHE44OwN73nuy4pZ5VRgPGWkQAQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"c6a9d9351a8a89b731272cec47564f20a5cdfa4c16d7ac3d9d1e8d4cf65a09ad","last_reissued_at":"2026-05-20T02:05:36.714432Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T02:05:36.714432Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"HiDe: Rethinking The Zoom-IN method in High Resolution MLLMs via Hierarchical Decoupling","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.CV","authors_text":"Bo Zheng, Jian Xu, Liang Wu, Xianjie Liu, Yiman Hu, Yixiong Zou","submitted_at":"2025-09-28T08:31:48Z","abstract_excerpt":"Multimodal Large Language Models (MLLMs) have made significant strides in visual understanding tasks. However, their performance on high-resolution images remains suboptimal. While existing approaches often attribute this limitation to perceptual constraints and argue that MLLMs struggle to recognize small objects, leading them to use \"zoom in\" strategies for better detail, our analysis reveals a different cause: the main issue is not object size, but rather caused by complex background interference. We systematically analyze this \"zoom in\" operation through a series of decoupling experiments "},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2510.00054","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2510.00054/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2510.00054","created_at":"2026-05-20T02:05:36.714567+00:00"},{"alias_kind":"arxiv_version","alias_value":"2510.00054v2","created_at":"2026-05-20T02:05:36.714567+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2510.00054","created_at":"2026-05-20T02:05:36.714567+00:00"},{"alias_kind":"pith_short_12","alias_value":"Y2U5SNI2RKE3","created_at":"2026-05-20T02:05:36.714567+00:00"},{"alias_kind":"pith_short_16","alias_value":"Y2U5SNI2RKE3OMJH","created_at":"2026-05-20T02:05:36.714567+00:00"},{"alias_kind":"pith_short_8","alias_value":"Y2U5SNI2","created_at":"2026-05-20T02:05:36.714567+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":2,"internal_anchor_count":2,"sample":[{"citing_arxiv_id":"2605.18740","citing_title":"Vision-OPD: Learning to See Fine Details for Multimodal LLMs via On-Policy Self-Distillation","ref_index":20,"is_internal_anchor":true},{"citing_arxiv_id":"2604.06912","citing_title":"Q-Zoom: Query-Aware Adaptive Perception for Efficient Multimodal Large Language Models","ref_index":53,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/Y2U5SNI2RKE3OMJHFTWEOVSPEC","json":"https://pith.science/pith/Y2U5SNI2RKE3OMJHFTWEOVSPEC.json","graph_json":"https://pith.science/api/pith-number/Y2U5SNI2RKE3OMJHFTWEOVSPEC/graph.json","events_json":"https://pith.science/api/pith-number/Y2U5SNI2RKE3OMJHFTWEOVSPEC/events.json","paper":"https://pith.science/paper/Y2U5SNI2"},"agent_actions":{"view_html":"https://pith.science/pith/Y2U5SNI2RKE3OMJHFTWEOVSPEC","download_json":"https://pith.science/pith/Y2U5SNI2RKE3OMJHFTWEOVSPEC.json","view_paper":"https://pith.science/paper/Y2U5SNI2","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2510.00054&json=true","fetch_graph":"https://pith.science/api/pith-number/Y2U5SNI2RKE3OMJHFTWEOVSPEC/graph.json","fetch_events":"https://pith.science/api/pith-number/Y2U5SNI2RKE3OMJHFTWEOVSPEC/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/Y2U5SNI2RKE3OMJHFTWEOVSPEC/action/timestamp_anchor","attest_storage":"https://pith.science/pith/Y2U5SNI2RKE3OMJHFTWEOVSPEC/action/storage_attestation","attest_author":"https://pith.science/pith/Y2U5SNI2RKE3OMJHFTWEOVSPEC/action/author_attestation","sign_citation":"https://pith.science/pith/Y2U5SNI2RKE3OMJHFTWEOVSPEC/action/citation_signature","submit_replication":"https://pith.science/pith/Y2U5SNI2RKE3OMJHFTWEOVSPEC/action/replication_record"}},"created_at":"2026-05-20T02:05:36.714567+00:00","updated_at":"2026-05-20T02:05:36.714567+00:00"}