{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2024:RIWLAU73NIFFVIP6XO2C6DBCEB","short_pith_number":"pith:RIWLAU73","schema_version":"1.0","canonical_sha256":"8a2cb053fb6a0a5aa1febbb42f0c222054188b85aced14fc049bc5cdfdabd69d","source":{"kind":"arxiv","id":"2412.13630","version":1},"attestation_state":"computed","paper":{"title":"Policy Decorator: Model-Agnostic Online Refinement for Large Policy Model","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","cs.LG"],"primary_cat":"cs.RO","authors_text":"Hao Su, Mengke Zhang, Stone Tao, Tongzhou Mu, Xiu Yuan, Yunhao Fang","submitted_at":"2024-12-18T09:06:16Z","abstract_excerpt":"Recent advancements in robot learning have used imitation learning with large models and extensive demonstrations to develop effective policies. However, these models are often limited by the quantity, quality, and diversity of demonstrations. This paper explores improving offline-trained imitation learning models through online interactions with the environment. We introduce Policy Decorator, which uses a model-agnostic residual policy to refine large imitation learning models during online interactions. By implementing controlled exploration strategies, Policy Decorator enables stable, sampl"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2412.13630","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.RO","submitted_at":"2024-12-18T09:06:16Z","cross_cats_sorted":["cs.AI","cs.LG"],"title_canon_sha256":"3e62e69d46dfec5db6224bc372f920838ba67e71582ad6b0289b0456fffd90d4","abstract_canon_sha256":"76750d33565347386e6c88dad858c7324a6c29b8cc362fdb85c2b033206e13fd"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-07-05T09:51:05.481780Z","signature_b64":"Q3wUQNIsj2xPHqTodHhzfr2tI2CYqUpWHWQwVIwMxEzjXyX08pyR3oyl+b/ZSrT9Kw8vrVMku7fF+QlC27pNDA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"8a2cb053fb6a0a5aa1febbb42f0c222054188b85aced14fc049bc5cdfdabd69d","last_reissued_at":"2026-07-05T09:51:05.481288Z","signature_status":"signed_v1","first_computed_at":"2026-07-05T09:51:05.481288Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Policy Decorator: Model-Agnostic Online Refinement for Large Policy Model","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","cs.LG"],"primary_cat":"cs.RO","authors_text":"Hao Su, Mengke Zhang, Stone Tao, Tongzhou Mu, Xiu Yuan, Yunhao Fang","submitted_at":"2024-12-18T09:06:16Z","abstract_excerpt":"Recent advancements in robot learning have used imitation learning with large models and extensive demonstrations to develop effective policies. However, these models are often limited by the quantity, quality, and diversity of demonstrations. This paper explores improving offline-trained imitation learning models through online interactions with the environment. We introduce Policy Decorator, which uses a model-agnostic residual policy to refine large imitation learning models during online interactions. By implementing controlled exploration strategies, Policy Decorator enables stable, sampl"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2412.13630","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2412.13630/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2412.13630","created_at":"2026-07-05T09:51:05.481351+00:00"},{"alias_kind":"arxiv_version","alias_value":"2412.13630v1","created_at":"2026-07-05T09:51:05.481351+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2412.13630","created_at":"2026-07-05T09:51:05.481351+00:00"},{"alias_kind":"pith_short_12","alias_value":"RIWLAU73NIFF","created_at":"2026-07-05T09:51:05.481351+00:00"},{"alias_kind":"pith_short_16","alias_value":"RIWLAU73NIFFVIP6","created_at":"2026-07-05T09:51:05.481351+00:00"},{"alias_kind":"pith_short_8","alias_value":"RIWLAU73","created_at":"2026-07-05T09:51:05.481351+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":16,"internal_anchor_count":0,"sample":[{"citing_arxiv_id":"2606.22860","citing_title":"HiL-ResRL: A Model-Agnostic Finetuning Adapter via Human-in-the-loop Residual Reinforcement Learning","ref_index":15,"is_internal_anchor":false},{"citing_arxiv_id":"2606.23640","citing_title":"Learning Process Rewards via Success Visitation Matching for Efficient RL","ref_index":94,"is_internal_anchor":false},{"citing_arxiv_id":"2606.10825","citing_title":"MODIP: Efficient Model-Based Optimization for Diffusion Policies","ref_index":13,"is_internal_anchor":false},{"citing_arxiv_id":"2606.09630","citing_title":"ReCoVLA: VLM-Guided Reward Compilation for Failure Recovery in Vision-Language-Action Policies","ref_index":39,"is_internal_anchor":false},{"citing_arxiv_id":"2606.06461","citing_title":"Flow-based Policy Adaptation without Policy Updates","ref_index":15,"is_internal_anchor":false},{"citing_arxiv_id":"2606.03385","citing_title":"Grasp-Then-Plan with Failure Attribution: A Closed Two-Stage Framework for Precise and Generalizable Robotic Manipulation","ref_index":77,"is_internal_anchor":false},{"citing_arxiv_id":"2606.00269","citing_title":"Closed-Loop Neural Activation Control in Vision-Language-Action Models","ref_index":32,"is_internal_anchor":false},{"citing_arxiv_id":"2606.31958","citing_title":"Adapting Generalist Robot Policies with Semantic Reinforcement Learning","ref_index":14,"is_internal_anchor":false},{"citing_arxiv_id":"2605.05172","citing_title":"When Life Gives You BC, Make Q-functions: Extracting Q-values from Behavior Cloning for On-Robot Reinforcement Learning","ref_index":36,"is_internal_anchor":false},{"citing_arxiv_id":"2602.10503","citing_title":"Towards Long-Lived Robots: Continual Learning VLA Models via Reinforcement Fine-Tuning","ref_index":73,"is_internal_anchor":false},{"citing_arxiv_id":"2605.17486","citing_title":"DyGRO-VLA: Cross-Task Scaling of Vision-Language-Action Models via Dynamic Grouped Residual Optimization","ref_index":187,"is_internal_anchor":false},{"citing_arxiv_id":"2507.07986","citing_title":"EXPO: Stable Reinforcement Learning with Expressive Policies","ref_index":27,"is_internal_anchor":false},{"citing_arxiv_id":"2506.15799","citing_title":"Steering Your Diffusion Policy with Latent Space Reinforcement Learning","ref_index":65,"is_internal_anchor":false},{"citing_arxiv_id":"2603.15956","citing_title":"ExpertGen: Scalable Sim-to-Real Expert Policy Learning from Imperfect Behavior Priors","ref_index":32,"is_internal_anchor":false},{"citing_arxiv_id":"2605.05172","citing_title":"When Life Gives You BC, Make Q-functions: Extracting Q-values from Behavior Cloning for On-Robot Reinforcement Learning","ref_index":54,"is_internal_anchor":false},{"citing_arxiv_id":"2604.17919","citing_title":"Fisher Decorator: Refining Flow Policy via a Local Transport Map","ref_index":71,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/RIWLAU73NIFFVIP6XO2C6DBCEB","json":"https://pith.science/pith/RIWLAU73NIFFVIP6XO2C6DBCEB.json","graph_json":"https://pith.science/api/pith-number/RIWLAU73NIFFVIP6XO2C6DBCEB/graph.json","events_json":"https://pith.science/api/pith-number/RIWLAU73NIFFVIP6XO2C6DBCEB/events.json","paper":"https://pith.science/paper/RIWLAU73"},"agent_actions":{"view_html":"https://pith.science/pith/RIWLAU73NIFFVIP6XO2C6DBCEB","download_json":"https://pith.science/pith/RIWLAU73NIFFVIP6XO2C6DBCEB.json","view_paper":"https://pith.science/paper/RIWLAU73","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2412.13630&json=true","fetch_graph":"https://pith.science/api/pith-number/RIWLAU73NIFFVIP6XO2C6DBCEB/graph.json","fetch_events":"https://pith.science/api/pith-number/RIWLAU73NIFFVIP6XO2C6DBCEB/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/RIWLAU73NIFFVIP6XO2C6DBCEB/action/timestamp_anchor","attest_storage":"https://pith.science/pith/RIWLAU73NIFFVIP6XO2C6DBCEB/action/storage_attestation","attest_author":"https://pith.science/pith/RIWLAU73NIFFVIP6XO2C6DBCEB/action/author_attestation","sign_citation":"https://pith.science/pith/RIWLAU73NIFFVIP6XO2C6DBCEB/action/citation_signature","submit_replication":"https://pith.science/pith/RIWLAU73NIFFVIP6XO2C6DBCEB/action/replication_record"}},"created_at":"2026-07-05T09:51:05.481351+00:00","updated_at":"2026-07-05T09:51:05.481351+00:00"}