{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2021:ELRZAFKNEJAHGXLGVCE5VFMBZI","short_pith_number":"pith:ELRZAFKN","schema_version":"1.0","canonical_sha256":"22e390154d2240735d66a889da9581ca0deab5a146b70db15af1f7f11686887c","source":{"kind":"arxiv","id":"2110.02178","version":2},"attestation_state":"computed","paper":{"title":"MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","cs.LG"],"primary_cat":"cs.CV","authors_text":"Mohammad Rastegari, Sachin Mehta","submitted_at":"2021-10-05T17:07:53Z","abstract_excerpt":"Light-weight convolutional neural networks (CNNs) are the de-facto for mobile vision tasks. Their spatial inductive biases allow them to learn representations with fewer parameters across different vision tasks. However, these networks are spatially local. To learn global representations, self-attention-based vision trans-formers (ViTs) have been adopted. Unlike CNNs, ViTs are heavy-weight. In this paper, we ask the following question: is it possible to combine the strengths of CNNs and ViTs to build a light-weight and low latency network for mobile vision tasks? Towards this end, we introduce"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2110.02178","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2021-10-05T17:07:53Z","cross_cats_sorted":["cs.AI","cs.LG"],"title_canon_sha256":"5dbc9e592e87be75d16342427694e7798726460eb9c431dbea41aa4ceecea0d4","abstract_canon_sha256":"d29ca47d3f6899be06c684b7be7b8546ec412de603ddadd370a9bb2617ec876e"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T20:40:06.252260Z","signature_b64":"ZFtK37LkpGWlJMLKBdXFIKl97DaYp6Bl2a3aww8cdhmdLWWeYwCmrCnQqJuZmTNbzuzkHPEBICqRTobDaWUKBg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"22e390154d2240735d66a889da9581ca0deab5a146b70db15af1f7f11686887c","last_reissued_at":"2026-05-20T20:40:06.250278Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T20:40:06.250278Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","cs.LG"],"primary_cat":"cs.CV","authors_text":"Mohammad Rastegari, Sachin Mehta","submitted_at":"2021-10-05T17:07:53Z","abstract_excerpt":"Light-weight convolutional neural networks (CNNs) are the de-facto for mobile vision tasks. Their spatial inductive biases allow them to learn representations with fewer parameters across different vision tasks. However, these networks are spatially local. To learn global representations, self-attention-based vision trans-formers (ViTs) have been adopted. Unlike CNNs, ViTs are heavy-weight. In this paper, we ask the following question: is it possible to combine the strengths of CNNs and ViTs to build a light-weight and low latency network for mobile vision tasks? Towards this end, we introduce"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2110.02178","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2110.02178/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2110.02178","created_at":"2026-05-20T20:40:06.250385+00:00"},{"alias_kind":"arxiv_version","alias_value":"2110.02178v2","created_at":"2026-05-20T20:40:06.250385+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2110.02178","created_at":"2026-05-20T20:40:06.250385+00:00"},{"alias_kind":"pith_short_12","alias_value":"ELRZAFKNEJAH","created_at":"2026-05-20T20:40:06.250385+00:00"},{"alias_kind":"pith_short_16","alias_value":"ELRZAFKNEJAHGXLG","created_at":"2026-05-20T20:40:06.250385+00:00"},{"alias_kind":"pith_short_8","alias_value":"ELRZAFKN","created_at":"2026-05-20T20:40:06.250385+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":21,"internal_anchor_count":21,"sample":[{"citing_arxiv_id":"2605.16401","citing_title":"CADS: Conformal Adaptive Decision System for Cost-Efficient Image Classification","ref_index":32,"is_internal_anchor":true},{"citing_arxiv_id":"2605.15551","citing_title":"Characterizing Learning in Deep Neural Networks using Tractable Algorithmic Complexity Analysis","ref_index":18,"is_internal_anchor":true},{"citing_arxiv_id":"2605.19478","citing_title":"Exposing Functional Fusion: A New Class of Strategic Backdoor in Dynamic Prompt Architectures","ref_index":17,"is_internal_anchor":true},{"citing_arxiv_id":"2507.05193","citing_title":"RAM-W600: A Multi-Task Wrist Dataset and Benchmark for Rheumatoid Arthritis","ref_index":46,"is_internal_anchor":true},{"citing_arxiv_id":"2507.23315","citing_title":"Analysis of Hyperparameter Optimization Effects on Lightweight Deep Models for Real-Time Image Classification","ref_index":4,"is_internal_anchor":true},{"citing_arxiv_id":"2510.19239","citing_title":"TinyUSFM: Towards Compact and Efficient Ultrasound Foundation Models","ref_index":6,"is_internal_anchor":true},{"citing_arxiv_id":"2306.14289","citing_title":"Faster Segment Anything: Towards Lightweight SAM for Mobile Applications","ref_index":15,"is_internal_anchor":true},{"citing_arxiv_id":"2603.15941","citing_title":"Towards Fair and Robust Volumetric CT Classification via KL-Regularised Group Distributionally Robust Optimisation","ref_index":21,"is_internal_anchor":true},{"citing_arxiv_id":"2605.14110","citing_title":"SToRe3D: Sparse Token Relevance in ViTs for Efficient Multi-View 3D Object Detection","ref_index":36,"is_internal_anchor":true},{"citing_arxiv_id":"2605.11563","citing_title":"TCP-SSM: Efficient Vision State Space Models with Token-Conditioned Poles","ref_index":37,"is_internal_anchor":true},{"citing_arxiv_id":"2604.24810","citing_title":"A Comparative Analysis on the Performance of Upper Confidence Bound Algorithms in Adaptive Deep Neural Networks","ref_index":28,"is_internal_anchor":true},{"citing_arxiv_id":"2604.23728","citing_title":"ESIA: An Energy-Based Spatiotemporal Interaction-Aware Framework for Pedestrian Intention Prediction","ref_index":53,"is_internal_anchor":true},{"citing_arxiv_id":"2604.23387","citing_title":"Keypoint-based Dynamic Object 6-DoF Pose Tracking via Event Camera","ref_index":27,"is_internal_anchor":true},{"citing_arxiv_id":"2604.23320","citing_title":"KAConvNet: Kolmogorov-Arnold Convolutional Networks for Vision Recognition","ref_index":55,"is_internal_anchor":true},{"citing_arxiv_id":"2604.23137","citing_title":"CNN-ViT Fusion with Adaptive Attention Gate for Brain Tumor MRI Classification: A Hybrid Deep Learning Model","ref_index":4,"is_internal_anchor":true},{"citing_arxiv_id":"2605.05616","citing_title":"RAM-H1200: A Unified Evaluation and Dataset on Hand Radiographs for Rheumatoid Arthritis","ref_index":41,"is_internal_anchor":true},{"citing_arxiv_id":"2604.14630","citing_title":"CMTM: Cross-Modal Token Modulation for Unsupervised Video Object Segmentation","ref_index":20,"is_internal_anchor":true},{"citing_arxiv_id":"2605.06714","citing_title":"Edge Deep Learning in Computer Vision and Medical Diagnostics: A Comprehensive Survey","ref_index":206,"is_internal_anchor":true},{"citing_arxiv_id":"2605.07023","citing_title":"OneViewAll: Semantic Prior Guided One-View 6D Pose Estimation for Novel Objects","ref_index":60,"is_internal_anchor":true},{"citing_arxiv_id":"2604.05431","citing_title":"Cross-Stage Attention Propagation for Efficient Semantic Segmentation","ref_index":19,"is_internal_anchor":true},{"citing_arxiv_id":"2604.16555","citing_title":"LLM as a Tool, Not an Agent: Code-Mined Tree Transformations for Neural Architecture Search","ref_index":36,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/ELRZAFKNEJAHGXLGVCE5VFMBZI","json":"https://pith.science/pith/ELRZAFKNEJAHGXLGVCE5VFMBZI.json","graph_json":"https://pith.science/api/pith-number/ELRZAFKNEJAHGXLGVCE5VFMBZI/graph.json","events_json":"https://pith.science/api/pith-number/ELRZAFKNEJAHGXLGVCE5VFMBZI/events.json","paper":"https://pith.science/paper/ELRZAFKN"},"agent_actions":{"view_html":"https://pith.science/pith/ELRZAFKNEJAHGXLGVCE5VFMBZI","download_json":"https://pith.science/pith/ELRZAFKNEJAHGXLGVCE5VFMBZI.json","view_paper":"https://pith.science/paper/ELRZAFKN","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2110.02178&json=true","fetch_graph":"https://pith.science/api/pith-number/ELRZAFKNEJAHGXLGVCE5VFMBZI/graph.json","fetch_events":"https://pith.science/api/pith-number/ELRZAFKNEJAHGXLGVCE5VFMBZI/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/ELRZAFKNEJAHGXLGVCE5VFMBZI/action/timestamp_anchor","attest_storage":"https://pith.science/pith/ELRZAFKNEJAHGXLGVCE5VFMBZI/action/storage_attestation","attest_author":"https://pith.science/pith/ELRZAFKNEJAHGXLGVCE5VFMBZI/action/author_attestation","sign_citation":"https://pith.science/pith/ELRZAFKNEJAHGXLGVCE5VFMBZI/action/citation_signature","submit_replication":"https://pith.science/pith/ELRZAFKNEJAHGXLGVCE5VFMBZI/action/replication_record"}},"created_at":"2026-05-20T20:40:06.250385+00:00","updated_at":"2026-05-20T20:40:06.250385+00:00"}