{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2018:NYG3VZKVUB6VVHRJR3VUFWNGJW","short_pith_number":"pith:NYG3VZKV","canonical_record":{"source":{"id":"1802.10031","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2018-02-27T17:16:48Z","cross_cats_sorted":["stat.ML"],"title_canon_sha256":"5322785f1baeff8896106b5ba05258dbae7188b71d354b3fd258b1dc79296307","abstract_canon_sha256":"efede7856388df343b4069f2ad940e138099b827aeb9719d80645991f178c4d1"},"schema_version":"1.0"},"canonical_sha256":"6e0dbae555a07d5a9e298eeb42d9a64da8c256edfc375ee5cf9530b13d718c44","source":{"kind":"arxiv","id":"1802.10031","version":3},"source_aliases":[{"alias_kind":"arxiv","alias_value":"1802.10031","created_at":"2026-05-18T00:00:30Z"},{"alias_kind":"arxiv_version","alias_value":"1802.10031v3","created_at":"2026-05-18T00:00:30Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1802.10031","created_at":"2026-05-18T00:00:30Z"},{"alias_kind":"pith_short_12","alias_value":"NYG3VZKVUB6V","created_at":"2026-05-18T12:32:40Z"},{"alias_kind":"pith_short_16","alias_value":"NYG3VZKVUB6VVHRJ","created_at":"2026-05-18T12:32:40Z"},{"alias_kind":"pith_short_8","alias_value":"NYG3VZKV","created_at":"2026-05-18T12:32:40Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2018:NYG3VZKVUB6VVHRJR3VUFWNGJW","target":"record","payload":{"canonical_record":{"source":{"id":"1802.10031","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2018-02-27T17:16:48Z","cross_cats_sorted":["stat.ML"],"title_canon_sha256":"5322785f1baeff8896106b5ba05258dbae7188b71d354b3fd258b1dc79296307","abstract_canon_sha256":"efede7856388df343b4069f2ad940e138099b827aeb9719d80645991f178c4d1"},"schema_version":"1.0"},"canonical_sha256":"6e0dbae555a07d5a9e298eeb42d9a64da8c256edfc375ee5cf9530b13d718c44","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T00:00:30.627626Z","signature_b64":"P8/saTml2Wl+bfkqh76N92J8C397bfwAm8R0GFBN7Zmi9OfvLMVGi0otOcgA4JBOMCf4LAVVcRW0fiROjzzDAg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"6e0dbae555a07d5a9e298eeb42d9a64da8c256edfc375ee5cf9530b13d718c44","last_reissued_at":"2026-05-18T00:00:30.627163Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T00:00:30.627163Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"1802.10031","source_version":3,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T00:00:30Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"CO/mZ3XbZ6e3yWoKHiyIMR1W5BRb8BDMCSL5W7rx6m4NMh/HrKKiDbd4CRC1WY5fW6dv2f/5DJ9l9s0iVbdMCA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-04T11:09:59.285720Z"},"content_sha256":"9f8f1eb32a2a007525419effd6e0834cf2ddd2cb7ee5bacd01cb10de844115c0","schema_version":"1.0","event_id":"sha256:9f8f1eb32a2a007525419effd6e0834cf2ddd2cb7ee5bacd01cb10de844115c0"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2018:NYG3VZKVUB6VVHRJR3VUFWNGJW","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"The Mirage of Action-Dependent Baselines in Reinforcement Learning","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["stat.ML"],"primary_cat":"cs.LG","authors_text":"George Tucker, Richard E. Turner, Sergey Levine, Shixiang Gu, Surya Bhupatiraju, Zoubin Ghahramani","submitted_at":"2018-02-27T17:16:48Z","abstract_excerpt":"Policy gradient methods are a widely used class of model-free reinforcement learning algorithms where a state-dependent baseline is used to reduce gradient estimator variance. Several recent papers extend the baseline to depend on both the state and action and suggest that this significantly reduces variance and improves sample efficiency without introducing bias into the gradient estimates. To better understand this development, we decompose the variance of the policy gradient estimator and numerically show that learned state-action-dependent baselines do not in fact reduce variance over a st"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1802.10031","kind":"arxiv","version":3},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T00:00:30Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"uq6zlo2kyDRhZtSVJTgkpW6mWgo3IKPGdX8ivWTbVhveEW9goenUziKIoE4kEV3jdt1PIPnTxbFAwS8qStP5BA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-04T11:09:59.286066Z"},"content_sha256":"7493f77fbd6352202e1d76c3f16e17fc01f95e8f52d46bb9c0bc5606d969203c","schema_version":"1.0","event_id":"sha256:7493f77fbd6352202e1d76c3f16e17fc01f95e8f52d46bb9c0bc5606d969203c"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/NYG3VZKVUB6VVHRJR3VUFWNGJW/bundle.json","state_url":"https://pith.science/pith/NYG3VZKVUB6VVHRJR3VUFWNGJW/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/NYG3VZKVUB6VVHRJR3VUFWNGJW/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-04T11:09:59Z","links":{"resolver":"https://pith.science/pith/NYG3VZKVUB6VVHRJR3VUFWNGJW","bundle":"https://pith.science/pith/NYG3VZKVUB6VVHRJR3VUFWNGJW/bundle.json","state":"https://pith.science/pith/NYG3VZKVUB6VVHRJR3VUFWNGJW/state.json","well_known_bundle":"https://pith.science/.well-known/pith/NYG3VZKVUB6VVHRJR3VUFWNGJW/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2018:NYG3VZKVUB6VVHRJR3VUFWNGJW","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"efede7856388df343b4069f2ad940e138099b827aeb9719d80645991f178c4d1","cross_cats_sorted":["stat.ML"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2018-02-27T17:16:48Z","title_canon_sha256":"5322785f1baeff8896106b5ba05258dbae7188b71d354b3fd258b1dc79296307"},"schema_version":"1.0","source":{"id":"1802.10031","kind":"arxiv","version":3}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"1802.10031","created_at":"2026-05-18T00:00:30Z"},{"alias_kind":"arxiv_version","alias_value":"1802.10031v3","created_at":"2026-05-18T00:00:30Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1802.10031","created_at":"2026-05-18T00:00:30Z"},{"alias_kind":"pith_short_12","alias_value":"NYG3VZKVUB6V","created_at":"2026-05-18T12:32:40Z"},{"alias_kind":"pith_short_16","alias_value":"NYG3VZKVUB6VVHRJ","created_at":"2026-05-18T12:32:40Z"},{"alias_kind":"pith_short_8","alias_value":"NYG3VZKV","created_at":"2026-05-18T12:32:40Z"}],"graph_snapshots":[{"event_id":"sha256:7493f77fbd6352202e1d76c3f16e17fc01f95e8f52d46bb9c0bc5606d969203c","target":"graph","created_at":"2026-05-18T00:00:30Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"Policy gradient methods are a widely used class of model-free reinforcement learning algorithms where a state-dependent baseline is used to reduce gradient estimator variance. Several recent papers extend the baseline to depend on both the state and action and suggest that this significantly reduces variance and improves sample efficiency without introducing bias into the gradient estimates. To better understand this development, we decompose the variance of the policy gradient estimator and numerically show that learned state-action-dependent baselines do not in fact reduce variance over a st","authors_text":"George Tucker, Richard E. Turner, Sergey Levine, Shixiang Gu, Surya Bhupatiraju, Zoubin Ghahramani","cross_cats":["stat.ML"],"headline":"","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2018-02-27T17:16:48Z","title":"The Mirage of Action-Dependent Baselines in Reinforcement Learning"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1802.10031","kind":"arxiv","version":3},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:9f8f1eb32a2a007525419effd6e0834cf2ddd2cb7ee5bacd01cb10de844115c0","target":"record","created_at":"2026-05-18T00:00:30Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"efede7856388df343b4069f2ad940e138099b827aeb9719d80645991f178c4d1","cross_cats_sorted":["stat.ML"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2018-02-27T17:16:48Z","title_canon_sha256":"5322785f1baeff8896106b5ba05258dbae7188b71d354b3fd258b1dc79296307"},"schema_version":"1.0","source":{"id":"1802.10031","kind":"arxiv","version":3}},"canonical_sha256":"6e0dbae555a07d5a9e298eeb42d9a64da8c256edfc375ee5cf9530b13d718c44","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"6e0dbae555a07d5a9e298eeb42d9a64da8c256edfc375ee5cf9530b13d718c44","first_computed_at":"2026-05-18T00:00:30.627163Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-18T00:00:30.627163Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"P8/saTml2Wl+bfkqh76N92J8C397bfwAm8R0GFBN7Zmi9OfvLMVGi0otOcgA4JBOMCf4LAVVcRW0fiROjzzDAg==","signature_status":"signed_v1","signed_at":"2026-05-18T00:00:30.627626Z","signed_message":"canonical_sha256_bytes"},"source_id":"1802.10031","source_kind":"arxiv","source_version":3}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:9f8f1eb32a2a007525419effd6e0834cf2ddd2cb7ee5bacd01cb10de844115c0","sha256:7493f77fbd6352202e1d76c3f16e17fc01f95e8f52d46bb9c0bc5606d969203c"],"state_sha256":"e853b1902bf679886597e196a6d27ebe447a850fac939dc4833f0a2be434a276"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"fNicVkWLFnG0Im/N/v7XpDdYDA58X/qK3weNO1K1GRtanEWBQUYa7bnfkc/5whxueMCvKUeKn5aZD0zDdiZuDg==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-04T11:09:59.288083Z","bundle_sha256":"c357b21e34ec2454049c98b95942f23c3193c0e2bbff074ae524d8e6b6a349b9"}}