{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2017:AGUXAQUTVZELLDTINOAEZRIJXS","short_pith_number":"pith:AGUXAQUT","schema_version":"1.0","canonical_sha256":"01a9704293ae48b58e686b804cc509bc8c58b7737730f3547e4f42766433994e","source":{"kind":"arxiv","id":"1707.08817","version":2},"attestation_state":"computed","paper":{"title":"Leveraging Demonstrations for Deep Reinforcement Learning on Robotics Problems with Sparse Rewards","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.AI","authors_text":"Bilal Piot, Fumin Wang, Jonathan Scholz, Martin Riedmiller, Mel Vecerik, Nicolas Heess, Olivier Pietquin, Thomas Lampe, Thomas Rothörl, Todd Hester","submitted_at":"2017-07-27T11:16:53Z","abstract_excerpt":"We propose a general and model-free approach for Reinforcement Learning (RL) on real robotics with sparse rewards. We build upon the Deep Deterministic Policy Gradient (DDPG) algorithm to use demonstrations. Both demonstrations and actual interactions are used to fill a replay buffer and the sampling ratio between demonstrations and transitions is automatically tuned via a prioritized replay mechanism. Typically, carefully engineered shaping rewards are required to enable the agents to efficiently explore on high dimensional control problems such as robotics. 
They are also required for model-b"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1707.08817","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2017-07-27T11:16:53Z","cross_cats_sorted":[],"title_canon_sha256":"31ac8aee2c5377055d09451b8f60bc35597766337c2f1bf9e706bd7b70fc0626","abstract_canon_sha256":"ea901a17e7aef1852fbb4497579d073b929292d3d7d71c718409ea28403baee4"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T00:03:56.930530Z","signature_b64":"NFTM+TiM2VeaqWZ23ynM7Z+maXOx3JfPaUgPFc6mgG6q3A42EDCyd9xAice1eIsXE22FTm45c9YjFVLP00JCBg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"01a9704293ae48b58e686b804cc509bc8c58b7737730f3547e4f42766433994e","last_reissued_at":"2026-05-18T00:03:56.929806Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T00:03:56.929806Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Leveraging Demonstrations for Deep Reinforcement Learning on Robotics Problems with Sparse Rewards","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.AI","authors_text":"Bilal Piot, Fumin Wang, Jonathan Scholz, Martin Riedmiller, Mel Vecerik, Nicolas Heess, Olivier Pietquin, Thomas Lampe, Thomas Rothörl, Todd Hester","submitted_at":"2017-07-27T11:16:53Z","abstract_excerpt":"We propose a general and model-free approach for Reinforcement Learning (RL) on real robotics 
with sparse rewards. We build upon the Deep Deterministic Policy Gradient (DDPG) algorithm to use demonstrations. Both demonstrations and actual interactions are used to fill a replay buffer and the sampling ratio between demonstrations and transitions is automatically tuned via a prioritized replay mechanism. Typically, carefully engineered shaping rewards are required to enable the agents to efficiently explore on high dimensional control problems such as robotics. They are also required for model-b"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1707.08817","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1707.08817","created_at":"2026-05-18T00:03:56.929931+00:00"},{"alias_kind":"arxiv_version","alias_value":"1707.08817v2","created_at":"2026-05-18T00:03:56.929931+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1707.08817","created_at":"2026-05-18T00:03:56.929931+00:00"},{"alias_kind":"pith_short_12","alias_value":"AGUXAQUTVZEL","created_at":"2026-05-18T12:31:05.417338+00:00"},{"alias_kind":"pith_short_16","alias_value":"AGUXAQUTVZELLDTI","created_at":"2026-05-18T12:31:05.417338+00:00"},{"alias_kind":"pith_short_8","alias_value":"AGUXAQUT","created_at":"2026-05-18T12:31:05.417338+00:00"}],"events":[],"event_summ
ary":{},"paper_claims":[],"inbound_citations":{"count":17,"internal_anchor_count":8,"sample":[{"citing_arxiv_id":"1906.10124","citing_title":"On Multi-Agent Learning in Team Sports Games","ref_index":15,"is_internal_anchor":true},{"citing_arxiv_id":"2605.05863","citing_title":"SOPE: Stabilizing Off-Policy Evaluation for Online RL with Prior Data","ref_index":16,"is_internal_anchor":true},{"citing_arxiv_id":"2605.19592","citing_title":"Implicit Action Chunking for Smooth Continuous Control","ref_index":30,"is_internal_anchor":true},{"citing_arxiv_id":"2507.07986","citing_title":"EXPO: Stable Reinforcement Learning with Expressive Policies","ref_index":26,"is_internal_anchor":true},{"citing_arxiv_id":"2509.16615","citing_title":"LLM-Guided Task- and Affordance-Level Exploration in Reinforcement Learning","ref_index":11,"is_internal_anchor":true},{"citing_arxiv_id":"2505.18719","citing_title":"VLA-RL: Towards Masterful and General Robotic Manipulation with Scalable Reinforcement Learning","ref_index":71,"is_internal_anchor":true},{"citing_arxiv_id":"2409.00588","citing_title":"Diffusion Policy Policy Optimization","ref_index":95,"is_internal_anchor":true},{"citing_arxiv_id":"2605.14497","citing_title":"ROAD: Adaptive Data Mixing for Offline-to-Online Reinforcement Learning via Bi-Level Optimization","ref_index":26,"is_internal_anchor":true},{"citing_arxiv_id":"2605.12004","citing_title":"Learning Agentic Policy from Action Guidance","ref_index":59,"is_internal_anchor":false},{"citing_arxiv_id":"2605.11387","citing_title":"Behavioral Mode Discovery for Fine-tuning Multimodal Generative Policies","ref_index":35,"is_internal_anchor":false},{"citing_arxiv_id":"2605.10734","citing_title":"XQCfD: Accelerating Fast Actor-Critic Algorithms with Prior Data and Prior Policies","ref_index":41,"is_internal_anchor":false},{"citing_arxiv_id":"2605.06228","citing_title":"Soft Deterministic Policy Gradient with Gaussian 
Smoothing","ref_index":28,"is_internal_anchor":false},{"citing_arxiv_id":"2605.05863","citing_title":"SOPE: Stabilizing Off-Policy Evaluation for Online RL with Prior Data","ref_index":16,"is_internal_anchor":false},{"citing_arxiv_id":"2605.04185","citing_title":"Constraint-Enhanced Reinforcement Learning Based on Dynamic Decoupled Spherical Radial Squashing","ref_index":23,"is_internal_anchor":false},{"citing_arxiv_id":"2604.08036","citing_title":"PriPG-RL: Privileged Planner-Guided Reinforcement Learning for Partially Observable Systems with Anytime-Feasible MPC","ref_index":23,"is_internal_anchor":false},{"citing_arxiv_id":"2604.13733","citing_title":"Jump-Start Reinforcement Learning with Vision-Language-Action Regularization","ref_index":43,"is_internal_anchor":false},{"citing_arxiv_id":"2605.01729","citing_title":"Stable GFlowNets with Probabilistic Guarantees","ref_index":25,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/AGUXAQUTVZELLDTINOAEZRIJXS","json":"https://pith.science/pith/AGUXAQUTVZELLDTINOAEZRIJXS.json","graph_json":"https://pith.science/api/pith-number/AGUXAQUTVZELLDTINOAEZRIJXS/graph.json","events_json":"https://pith.science/api/pith-number/AGUXAQUTVZELLDTINOAEZRIJXS/events.json","paper":"https://pith.science/paper/AGUXAQUT"},"agent_actions":{"view_html":"https://pith.science/pith/AGUXAQUTVZELLDTINOAEZRIJXS","download_json":"https://pith.science/pith/AGUXAQUTVZELLDTINOAEZRIJXS.json","view_paper":"https://pith.science/paper/AGUXAQUT","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1707.08817&json=true","fetch_graph":"https://pith.science/api/pith-number/AGUXAQUTVZELLDTINOAEZRIJXS/graph.json","fetch_events":"https://pith.science/api/pith-number/AGUXAQUTVZELLDTINOAEZRIJXS/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/AGUXAQUTVZELLDTINOAEZRIJXS/action/timestamp_anchor","attest_storage":"https://pith.science/pith/AGUXAQUTV
ZELLDTINOAEZRIJXS/action/storage_attestation","attest_author":"https://pith.science/pith/AGUXAQUTVZELLDTINOAEZRIJXS/action/author_attestation","sign_citation":"https://pith.science/pith/AGUXAQUTVZELLDTINOAEZRIJXS/action/citation_signature","submit_replication":"https://pith.science/pith/AGUXAQUTVZELLDTINOAEZRIJXS/action/replication_record"}},"created_at":"2026-05-18T00:03:56.929931+00:00","updated_at":"2026-05-18T00:03:56.929931+00:00"}