{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:KY23CKV637JEFOY3F2U5FDNAQC","short_pith_number":"pith:KY23CKV6","schema_version":"1.0","canonical_sha256":"5635b12abedfd242bb1b2ea9d28da0809f33b00a6fcecd2d4e3503b08d3bf58f","source":{"kind":"arxiv","id":"2512.20014","version":3},"attestation_state":"computed","paper":{"title":"Bring My Cup! Personalizing Vision-Language-Action Models with Visual Attentive Prompting","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.RO","authors_text":"Sangoh Lee, Sangwoo Mo, Wook-Shin Han","submitted_at":"2025-12-23T03:13:39Z","abstract_excerpt":"While Vision-Language-Action (VLA) models generalize well to generic instructions, they struggle with personalized commands such as \"bring my cup,\" where the robot must act on one specific instance among visually similar objects. We study this setting of manipulating personal objects, in which a VLA must identify and control a user-specific object unseen during training using only a few reference images. To address this challenge, we propose Visual Attentive Prompting (VAP), a simple-yet-effective training-free perceptual adapter that equips frozen VLAs with top-down selective attention. VAP t"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2512.20014","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.RO","submitted_at":"2025-12-23T03:13:39Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"04329177678edda72c593fe443511136decad420fcd53209fd5044451845ccb3","abstract_canon_sha256":"1973bf105367297a6479504a9c8dee184b27763e5249c97d7439a0f5f61ab1f5"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-19T16:12:49.999896Z","signature_b64":"NmqZ6t6/haveOB5T0sxSz2vdP3OPQQR7SEZeNPiUI6c64adHaSim9HjNd2ySr2ktHGxGuQSfS9KlntWwR8vcDg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"5635b12abedfd242bb1b2ea9d28da0809f33b00a6fcecd2d4e3503b08d3bf58f","last_reissued_at":"2026-06-19T16:12:49.999535Z","signature_status":"signed_v1","first_computed_at":"2026-06-19T16:12:49.999535Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Bring My Cup! Personalizing Vision-Language-Action Models with Visual Attentive Prompting","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.RO","authors_text":"Sangoh Lee, Sangwoo Mo, Wook-Shin Han","submitted_at":"2025-12-23T03:13:39Z","abstract_excerpt":"While Vision-Language-Action (VLA) models generalize well to generic instructions, they struggle with personalized commands such as \"bring my cup,\" where the robot must act on one specific instance among visually similar objects. We study this setting of manipulating personal objects, in which a VLA must identify and control a user-specific object unseen during training using only a few reference images. To address this challenge, we propose Visual Attentive Prompting (VAP), a simple-yet-effective training-free perceptual adapter that equips frozen VLAs with top-down selective attention. VAP t"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2512.20014","kind":"arxiv","version":3},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2512.20014/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2512.20014","created_at":"2026-06-19T16:12:49.999606+00:00"},{"alias_kind":"arxiv_version","alias_value":"2512.20014v3","created_at":"2026-06-19T16:12:49.999606+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2512.20014","created_at":"2026-06-19T16:12:49.999606+00:00"},{"alias_kind":"pith_short_12","alias_value":"KY23CKV637JE","created_at":"2026-06-19T16:12:49.999606+00:00"},{"alias_kind":"pith_short_16","alias_value":"KY23CKV637JEFOY3","created_at":"2026-06-19T16:12:49.999606+00:00"},{"alias_kind":"pith_short_8","alias_value":"KY23CKV6","created_at":"2026-06-19T16:12:49.999606+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":2,"internal_anchor_count":2,"sample":[{"citing_arxiv_id":"2603.22003","citing_title":"VP-VLA: Visual Prompting as an Interface for Vision-Language-Action Models","ref_index":19,"is_internal_anchor":true},{"citing_arxiv_id":"2605.12160","citing_title":"Premover: Fast Vision-Language-Action Control by Acting Before Instructions Are Complete","ref_index":10,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/KY23CKV637JEFOY3F2U5FDNAQC","json":"https://pith.science/pith/KY23CKV637JEFOY3F2U5FDNAQC.json","graph_json":"https://pith.science/api/pith-number/KY23CKV637JEFOY3F2U5FDNAQC/graph.json","events_json":"https://pith.science/api/pith-number/KY23CKV637JEFOY3F2U5FDNAQC/events.json","paper":"https://pith.science/paper/KY23CKV6"},"agent_actions":{"view_html":"https://pith.science/pith/KY23CKV637JEFOY3F2U5FDNAQC","download_json":"https://pith.science/pith/KY23CKV637JEFOY3F2U5FDNAQC.json","view_paper":"https://pith.science/paper/KY23CKV6","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2512.20014&json=true","fetch_graph":"https://pith.science/api/pith-number/KY23CKV637JEFOY3F2U5FDNAQC/graph.json","fetch_events":"https://pith.science/api/pith-number/KY23CKV637JEFOY3F2U5FDNAQC/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/KY23CKV637JEFOY3F2U5FDNAQC/action/timestamp_anchor","attest_storage":"https://pith.science/pith/KY23CKV637JEFOY3F2U5FDNAQC/action/storage_attestation","attest_author":"https://pith.science/pith/KY23CKV637JEFOY3F2U5FDNAQC/action/author_attestation","sign_citation":"https://pith.science/pith/KY23CKV637JEFOY3F2U5FDNAQC/action/citation_signature","submit_replication":"https://pith.science/pith/KY23CKV637JEFOY3F2U5FDNAQC/action/replication_record"}},"created_at":"2026-06-19T16:12:49.999606+00:00","updated_at":"2026-06-19T16:12:49.999606+00:00"}