{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2018:WSJMQTLSFAFDQYFLE3CH57TXLM","short_pith_number":"pith:WSJMQTLS","schema_version":"1.0","canonical_sha256":"b492c84d72280a3860ab26c47efe775b1dfe6150b37eb2ede70bf42f27efd45d","source":{"kind":"arxiv","id":"1812.08288","version":3},"attestation_state":"computed","paper":{"title":"TD-Regularized Actor-Critic Methods","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["stat.ML"],"primary_cat":"cs.LG","authors_text":"Jan Peters, Mohammad Emtiyaz Khan, Simone Parisi, Voot Tangkaratt","submitted_at":"2018-12-19T23:15:16Z","abstract_excerpt":"Actor-critic methods can achieve incredible performance on difficult reinforcement learning problems, but they are also prone to instability. This is partly due to the interaction between the actor and critic during learning, e.g., an inaccurate step taken by one of them might adversely affect the other and destabilize the learning. To avoid such issues, we propose to regularize the learning objective of the actor by penalizing the temporal difference (TD) error of the critic. This improves stability by avoiding large steps in the actor update whenever the critic is highly inaccurate. The resu"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1812.08288","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2018-12-19T23:15:16Z","cross_cats_sorted":["stat.ML"],"title_canon_sha256":"8296d7b177066ed324dedce13ebd3aabd7b525b48fbd40488900680a755c5e03","abstract_canon_sha256":"e1b2c3541e01d6321020744820c39c7240ed3ade08d55d291c4cea76c2f07f3e"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:52:51.217279Z","signature_b64":"jxPWBUmPPoPz7MNziabfKb0PXvPmC0wpzKQIEgayIR5Y8j1O28tNNDRZImxB2fRnuwMWQX/+YK5CNyq8jIfyCw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"b492c84d72280a3860ab26c47efe775b1dfe6150b37eb2ede70bf42f27efd45d","last_reissued_at":"2026-05-17T23:52:51.216501Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:52:51.216501Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"TD-Regularized Actor-Critic Methods","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["stat.ML"],"primary_cat":"cs.LG","authors_text":"Jan Peters, Mohammad Emtiyaz Khan, Simone Parisi, Voot Tangkaratt","submitted_at":"2018-12-19T23:15:16Z","abstract_excerpt":"Actor-critic methods can achieve incredible performance on difficult reinforcement learning problems, but they are also prone to instability. This is partly due to the interaction between the actor and critic during learning, e.g., an inaccurate step taken by one of them might adversely affect the other and destabilize the learning. To avoid such issues, we propose to regularize the learning objective of the actor by penalizing the temporal difference (TD) error of the critic. This improves stability by avoiding large steps in the actor update whenever the critic is highly inaccurate. The resu"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1812.08288","kind":"arxiv","version":3},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1812.08288","created_at":"2026-05-17T23:52:51.216631+00:00"},{"alias_kind":"arxiv_version","alias_value":"1812.08288v3","created_at":"2026-05-17T23:52:51.216631+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1812.08288","created_at":"2026-05-17T23:52:51.216631+00:00"},{"alias_kind":"pith_short_12","alias_value":"WSJMQTLSFAFD","created_at":"2026-05-18T12:33:01.666342+00:00"},{"alias_kind":"pith_short_16","alias_value":"WSJMQTLSFAFDQYFL","created_at":"2026-05-18T12:33:01.666342+00:00"},{"alias_kind":"pith_short_8","alias_value":"WSJMQTLS","created_at":"2026-05-18T12:33:01.666342+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/WSJMQTLSFAFDQYFLE3CH57TXLM","json":"https://pith.science/pith/WSJMQTLSFAFDQYFLE3CH57TXLM.json","graph_json":"https://pith.science/api/pith-number/WSJMQTLSFAFDQYFLE3CH57TXLM/graph.json","events_json":"https://pith.science/api/pith-number/WSJMQTLSFAFDQYFLE3CH57TXLM/events.json","paper":"https://pith.science/paper/WSJMQTLS"},"agent_actions":{"view_html":"https://pith.science/pith/WSJMQTLSFAFDQYFLE3CH57TXLM","download_json":"https://pith.science/pith/WSJMQTLSFAFDQYFLE3CH57TXLM.json","view_paper":"https://pith.science/paper/WSJMQTLS","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1812.08288&json=true","fetch_graph":"https://pith.science/api/pith-number/WSJMQTLSFAFDQYFLE3CH57TXLM/graph.json","fetch_events":"https://pith.science/api/pith-number/WSJMQTLSFAFDQYFLE3CH57TXLM/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/WSJMQTLSFAFDQYFLE3CH57TXLM/action/timestamp_anchor","attest_storage":"https://pith.science/pith/WSJMQTLSFAFDQYFLE3CH57TXLM/action/storage_attestation","attest_author":"https://pith.science/pith/WSJMQTLSFAFDQYFLE3CH57TXLM/action/author_attestation","sign_citation":"https://pith.science/pith/WSJMQTLSFAFDQYFLE3CH57TXLM/action/citation_signature","submit_replication":"https://pith.science/pith/WSJMQTLSFAFDQYFLE3CH57TXLM/action/replication_record"}},"created_at":"2026-05-17T23:52:51.216631+00:00","updated_at":"2026-05-17T23:52:51.216631+00:00"}