{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2018:VXM2BKV7YYCQDWWK67JZCWKQ6P","short_pith_number":"pith:VXM2BKV7","schema_version":"1.0","canonical_sha256":"add9a0aabfc60501dacaf7d3915950f3d2833985de9603179f66430483fcf33c","source":{"kind":"arxiv","id":"1802.05313","version":2},"attestation_state":"computed","paper":{"title":"Reinforcement Learning from Imperfect Demonstrations","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.LG","stat.ML"],"primary_cat":"cs.AI","authors_text":"Fisher Yu, Huazhe Xu, Ji Lin, Sergey Levine, Trevor Darrell, Yang Gao","submitted_at":"2018-02-14T20:37:38Z","abstract_excerpt":"Robust real-world learning should benefit from both demonstrations and interactions with the environment. Current approaches to learning from demonstration and reward perform supervised learning on expert demonstration data and use reinforcement learning to further improve performance based on the reward received from the environment. These tasks have divergent losses which are difficult to jointly optimize and such methods can be very sensitive to noisy demonstrations. We propose a unified reinforcement learning algorithm, Normalized Actor-Critic (NAC), that effectively normalizes the Q-funct"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1802.05313","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2018-02-14T20:37:38Z","cross_cats_sorted":["cs.LG","stat.ML"],"title_canon_sha256":"639c1af08e77f462cab2588e786eedd42118a45d66a00dd6b043ca9526f26c5d","abstract_canon_sha256":"a6ab0d1f8b0472cf8762d2099a08e135ff0c38151d379622d48a0bbad95fefba"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:44:43.424668Z","signature_b64":"hVtw9c554+TCs5pqjdb0WJDw2fkRq2Kh83dxbq9IH4HvhVbvAWhkZg1NO5X9pLsiKO9VZBM/UTHbg9HR1C17BQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"add9a0aabfc60501dacaf7d3915950f3d2833985de9603179f66430483fcf33c","last_reissued_at":"2026-05-17T23:44:43.424202Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:44:43.424202Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Reinforcement Learning from Imperfect Demonstrations","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.LG","stat.ML"],"primary_cat":"cs.AI","authors_text":"Fisher Yu, Huazhe Xu, Ji Lin, Sergey Levine, Trevor Darrell, Yang Gao","submitted_at":"2018-02-14T20:37:38Z","abstract_excerpt":"Robust real-world learning should benefit from both demonstrations and interactions with the environment. Current approaches to learning from demonstration and reward perform supervised learning on expert demonstration data and use reinforcement learning to further improve performance based on the reward received from the environment. These tasks have divergent losses which are difficult to jointly optimize and such methods can be very sensitive to noisy demonstrations. We propose a unified reinforcement learning algorithm, Normalized Actor-Critic (NAC), that effectively normalizes the Q-funct"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1802.05313","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1802.05313","created_at":"2026-05-17T23:44:43.424274+00:00"},{"alias_kind":"arxiv_version","alias_value":"1802.05313v2","created_at":"2026-05-17T23:44:43.424274+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1802.05313","created_at":"2026-05-17T23:44:43.424274+00:00"},{"alias_kind":"pith_short_12","alias_value":"VXM2BKV7YYCQ","created_at":"2026-05-18T12:32:59.047623+00:00"},{"alias_kind":"pith_short_16","alias_value":"VXM2BKV7YYCQDWWK","created_at":"2026-05-18T12:32:59.047623+00:00"},{"alias_kind":"pith_short_8","alias_value":"VXM2BKV7","created_at":"2026-05-18T12:32:59.047623+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":4,"internal_anchor_count":3,"sample":[{"citing_arxiv_id":"1907.02874","citing_title":"Attentive Multi-Task Deep Reinforcement Learning","ref_index":7,"is_internal_anchor":true},{"citing_arxiv_id":"2106.01345","citing_title":"Decision Transformer: Reinforcement Learning via Sequence Modeling","ref_index":26,"is_internal_anchor":true},{"citing_arxiv_id":"2510.09096","citing_title":"When a Robot is More Capable than a Human: Learning from Constrained Demonstrators","ref_index":7,"is_internal_anchor":true},{"citing_arxiv_id":"2006.09359","citing_title":"AWAC: Accelerating Online Reinforcement Learning with Offline Datasets","ref_index":15,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/VXM2BKV7YYCQDWWK67JZCWKQ6P","json":"https://pith.science/pith/VXM2BKV7YYCQDWWK67JZCWKQ6P.json","graph_json":"https://pith.science/api/pith-number/VXM2BKV7YYCQDWWK67JZCWKQ6P/graph.json","events_json":"https://pith.science/api/pith-number/VXM2BKV7YYCQDWWK67JZCWKQ6P/events.json","paper":"https://pith.science/paper/VXM2BKV7"},"agent_actions":{"view_html":"https://pith.science/pith/VXM2BKV7YYCQDWWK67JZCWKQ6P","download_json":"https://pith.science/pith/VXM2BKV7YYCQDWWK67JZCWKQ6P.json","view_paper":"https://pith.science/paper/VXM2BKV7","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1802.05313&json=true","fetch_graph":"https://pith.science/api/pith-number/VXM2BKV7YYCQDWWK67JZCWKQ6P/graph.json","fetch_events":"https://pith.science/api/pith-number/VXM2BKV7YYCQDWWK67JZCWKQ6P/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/VXM2BKV7YYCQDWWK67JZCWKQ6P/action/timestamp_anchor","attest_storage":"https://pith.science/pith/VXM2BKV7YYCQDWWK67JZCWKQ6P/action/storage_attestation","attest_author":"https://pith.science/pith/VXM2BKV7YYCQDWWK67JZCWKQ6P/action/author_attestation","sign_citation":"https://pith.science/pith/VXM2BKV7YYCQDWWK67JZCWKQ6P/action/citation_signature","submit_replication":"https://pith.science/pith/VXM2BKV7YYCQDWWK67JZCWKQ6P/action/replication_record"}},"created_at":"2026-05-17T23:44:43.424274+00:00","updated_at":"2026-05-17T23:44:43.424274+00:00"}