{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:Q5KELL7N4N5XLKAFZPGHKOSAP4","short_pith_number":"pith:Q5KELL7N","schema_version":"1.0","canonical_sha256":"875445afede37b75a805cbcc753a407f033bf2900a988265c747fdad578cd085","source":{"kind":"arxiv","id":"2512.10310","version":2},"attestation_state":"computed","paper":{"title":"Efficient-VLN: A Simple yet Strong Baseline for Efficient Vision-Language Navigation","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Duo Zheng, Liwei Wang, Shijia Huang, Yanyang Li","submitted_at":"2025-12-11T05:57:48Z","abstract_excerpt":"While Multimodal Large Language Models (MLLMs) have demonstrated significant promise in Vision-Language Navigation (VLN), existing agents remain heavily constrained by systemic bottlenecks across inference, training, and data collection. Specifically, they suffer from prohibitive latency due to visual history reprocessing, action leakage during sequence-packed training, and suboptimal exploration in self-correction data collection. To overcome these intertwined challenges, we present Efficient-VLN, a highly efficient and robust baseline that systematically resolves these issues through three s"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2512.10310","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2025-12-11T05:57:48Z","cross_cats_sorted":[],"title_canon_sha256":"9ce2646e7f4f3c46db250d44bc1f60b3a3489e04b601677294ba9bec975f47bf","abstract_canon_sha256":"a64b6dbc2a704791fbc829bf6cb63904cabe71223598a851aa4b937336370bcf"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-30T02:17:12.867400Z","signature_b64":"LnmPXGwcE2zOh9oGS1/MQWjIF1I4KJgXGzSNX9FR5sPTMbVwFql8PEcCnk97SpDJgyhbs8RTcjzkmHLZaA/lBw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"875445afede37b75a805cbcc753a407f033bf2900a988265c747fdad578cd085","last_reissued_at":"2026-06-30T02:17:12.866610Z","signature_status":"signed_v1","first_computed_at":"2026-06-30T02:17:12.866610Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Efficient-VLN: A Simple yet Strong Baseline for Efficient Vision-Language Navigation","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Duo Zheng, Liwei Wang, Shijia Huang, Yanyang Li","submitted_at":"2025-12-11T05:57:48Z","abstract_excerpt":"While Multimodal Large Language Models (MLLMs) have demonstrated significant promise in Vision-Language Navigation (VLN), existing agents remain heavily constrained by systemic bottlenecks across inference, training, and data collection. Specifically, they suffer from prohibitive latency due to visual history reprocessing, action leakage during sequence-packed training, and suboptimal exploration in self-correction data collection. To overcome these intertwined challenges, we present Efficient-VLN, a highly efficient and robust baseline that systematically resolves these issues through three s"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2512.10310","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2512.10310/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2512.10310","created_at":"2026-06-30T02:17:12.866735+00:00"},{"alias_kind":"arxiv_version","alias_value":"2512.10310v2","created_at":"2026-06-30T02:17:12.866735+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2512.10310","created_at":"2026-06-30T02:17:12.866735+00:00"},{"alias_kind":"pith_short_12","alias_value":"Q5KELL7N4N5X","created_at":"2026-06-30T02:17:12.866735+00:00"},{"alias_kind":"pith_short_16","alias_value":"Q5KELL7N4N5XLKAF","created_at":"2026-06-30T02:17:12.866735+00:00"},{"alias_kind":"pith_short_8","alias_value":"Q5KELL7N","created_at":"2026-06-30T02:17:12.866735+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":6,"internal_anchor_count":6,"sample":[{"citing_arxiv_id":"2605.23257","citing_title":"Turning Adaptation into Assets: Cross-Domain Bridging for Online Vision-Language Navigation","ref_index":35,"is_internal_anchor":true},{"citing_arxiv_id":"2605.13169","citing_title":"PanoWorld: Towards Spatial Supersensing in 360$^\\circ$ Panorama World","ref_index":59,"is_internal_anchor":true},{"citing_arxiv_id":"2603.07080","citing_title":"VLN-Cache: Enabling Token Caching for VLN Models with Visual/Semantic Dynamics Awareness","ref_index":9,"is_internal_anchor":true},{"citing_arxiv_id":"2605.13169","citing_title":"PanoWorld: Towards Spatial Supersensing in 360$^\\circ$ Panorama World","ref_index":59,"is_internal_anchor":true},{"citing_arxiv_id":"2605.13328","citing_title":"What Limits Vision-and-Language Navigation ?","ref_index":48,"is_internal_anchor":true},{"citing_arxiv_id":"2604.24391","citing_title":"FreqCache: Accelerating Embodied VLN Models with Adaptive Frequency-Guided Token Caching","ref_index":38,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/Q5KELL7N4N5XLKAFZPGHKOSAP4","json":"https://pith.science/pith/Q5KELL7N4N5XLKAFZPGHKOSAP4.json","graph_json":"https://pith.science/api/pith-number/Q5KELL7N4N5XLKAFZPGHKOSAP4/graph.json","events_json":"https://pith.science/api/pith-number/Q5KELL7N4N5XLKAFZPGHKOSAP4/events.json","paper":"https://pith.science/paper/Q5KELL7N"},"agent_actions":{"view_html":"https://pith.science/pith/Q5KELL7N4N5XLKAFZPGHKOSAP4","download_json":"https://pith.science/pith/Q5KELL7N4N5XLKAFZPGHKOSAP4.json","view_paper":"https://pith.science/paper/Q5KELL7N","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2512.10310&json=true","fetch_graph":"https://pith.science/api/pith-number/Q5KELL7N4N5XLKAFZPGHKOSAP4/graph.json","fetch_events":"https://pith.science/api/pith-number/Q5KELL7N4N5XLKAFZPGHKOSAP4/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/Q5KELL7N4N5XLKAFZPGHKOSAP4/action/timestamp_anchor","attest_storage":"https://pith.science/pith/Q5KELL7N4N5XLKAFZPGHKOSAP4/action/storage_attestation","attest_author":"https://pith.science/pith/Q5KELL7N4N5XLKAFZPGHKOSAP4/action/author_attestation","sign_citation":"https://pith.science/pith/Q5KELL7N4N5XLKAFZPGHKOSAP4/action/citation_signature","submit_replication":"https://pith.science/pith/Q5KELL7N4N5XLKAFZPGHKOSAP4/action/replication_record"}},"created_at":"2026-06-30T02:17:12.866735+00:00","updated_at":"2026-06-30T02:17:12.866735+00:00"}