{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2019:HA6XVOI5HK6HFAMH63BRIKJ5RC","short_pith_number":"pith:HA6XVOI5","canonical_record":{"source":{"id":"1904.03295","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2019-04-05T21:50:50Z","cross_cats_sorted":["cs.AI","stat.ML"],"title_canon_sha256":"2d81f14235b8a47cd4d3679c11585405321203cc8c243d68b9b5de7336f77dc2","abstract_canon_sha256":"4a91fe688b6137e4ccdbe2c30ae2ff4455aac11247daea4a57bf0984e7bfb2ed"},"schema_version":"1.0"},"canonical_sha256":"383d7ab91d3abc728187f6c314293d88bf4d345d7b04e7f4ee49cb157664b0fc","source":{"kind":"arxiv","id":"1904.03295","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"1904.03295","created_at":"2026-05-17T23:49:13Z"},{"alias_kind":"arxiv_version","alias_value":"1904.03295v1","created_at":"2026-05-17T23:49:13Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1904.03295","created_at":"2026-05-17T23:49:13Z"},{"alias_kind":"pith_short_12","alias_value":"HA6XVOI5HK6H","created_at":"2026-05-18T12:33:18Z"},{"alias_kind":"pith_short_16","alias_value":"HA6XVOI5HK6HFAMH","created_at":"2026-05-18T12:33:18Z"},{"alias_kind":"pith_short_8","alias_value":"HA6XVOI5","created_at":"2026-05-18T12:33:18Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2019:HA6XVOI5HK6HFAMH63BRIKJ5RC","target":"record","payload":{"canonical_record":{"source":{"id":"1904.03295","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2019-04-05T21:50:50Z","cross_cats_sorted":["cs.AI","stat.ML"],"title_canon_sha256":"2d81f14235b8a47cd4d3679c11585405321203cc8c243d68b9b5de7336f77dc2","abstract_canon_sha256":"4a91fe688b6137e4ccdbe2c30ae2ff4455aac11247daea4a57bf0984e7bfb2ed"},"schema_version":"1.0"},"canonical_sha256":"383d7ab91d3abc728187f6c314293d88bf4d345d7b04e7f4ee49cb157664b0fc","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:49:13.936377Z","signature_b64":"EdN5aCMFAA9c3xLnFV98u66KU8MkUwQFazAIpgZifzY3GlGXl5GIDbWAq0CAABSVUxQI2TAOC8nP94WFgDBNBQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"383d7ab91d3abc728187f6c314293d88bf4d345d7b04e7f4ee49cb157664b0fc","last_reissued_at":"2026-05-17T23:49:13.935669Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:49:13.935669Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"1904.03295","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:49:13Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"iNdi5TaDH1mQ8F2wTpFHyIsOrYAur32PZx9z3ZnwZ6NbOqklfXteckKtQZJHnw0eN9Z4Cf6Td+HXhmpKGDvmAg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-25T22:26:51.101429Z"},"content_sha256":"afa879f8d9468fb3213dac3a0f2a69e2206f49d2c888df7d797e97cab643c1a8","schema_version":"1.0","event_id":"sha256:afa879f8d9468fb3213dac3a0f2a69e2206f49d2c888df7d797e97cab643c1a8"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2019:HA6XVOI5HK6HFAMH63BRIKJ5RC","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Multi-Preference Actor Critic","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","stat.ML"],"primary_cat":"cs.LG","authors_text":"Adith Swaminathan, Ishan Durugkar, Matthew Hausknecht, Patrick MacAlpine","submitted_at":"2019-04-05T21:50:50Z","abstract_excerpt":"Policy gradient algorithms typically combine discounted future rewards with an estimated value function, to compute the direction and magnitude of parameter updates. However, for most Reinforcement Learning tasks, humans can provide additional insight to constrain the policy learning. We introduce a general method to incorporate multiple different feedback channels into a single policy gradient loss. In our formulation, the Multi-Preference Actor Critic (M-PAC), these different types of feedback are implemented as constraints on the policy. We use a Lagrangian relaxation to satisfy these const"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1904.03295","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:49:13Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"7nmOkaIskyA5pAYNWbMWCVMKtjVVCnkBi88riSIVv/J4HKayB4YQnwJrJs6CD5JzJCG638o/t2ZWz90zwZX5Aw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-25T22:26:51.102062Z"},"content_sha256":"4c8f4bb46a530256a97414d4599b471fc5f36a9f3e5776d0f4b2eeb0b8809193","schema_version":"1.0","event_id":"sha256:4c8f4bb46a530256a97414d4599b471fc5f36a9f3e5776d0f4b2eeb0b8809193"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/HA6XVOI5HK6HFAMH63BRIKJ5RC/bundle.json","state_url":"https://pith.science/pith/HA6XVOI5HK6HFAMH63BRIKJ5RC/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/HA6XVOI5HK6HFAMH63BRIKJ5RC/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-25T22:26:51Z","links":{"resolver":"https://pith.science/pith/HA6XVOI5HK6HFAMH63BRIKJ5RC","bundle":"https://pith.science/pith/HA6XVOI5HK6HFAMH63BRIKJ5RC/bundle.json","state":"https://pith.science/pith/HA6XVOI5HK6HFAMH63BRIKJ5RC/state.json","well_known_bundle":"https://pith.science/.well-known/pith/HA6XVOI5HK6HFAMH63BRIKJ5RC/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2019:HA6XVOI5HK6HFAMH63BRIKJ5RC","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"4a91fe688b6137e4ccdbe2c30ae2ff4455aac11247daea4a57bf0984e7bfb2ed","cross_cats_sorted":["cs.AI","stat.ML"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2019-04-05T21:50:50Z","title_canon_sha256":"2d81f14235b8a47cd4d3679c11585405321203cc8c243d68b9b5de7336f77dc2"},"schema_version":"1.0","source":{"id":"1904.03295","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"1904.03295","created_at":"2026-05-17T23:49:13Z"},{"alias_kind":"arxiv_version","alias_value":"1904.03295v1","created_at":"2026-05-17T23:49:13Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1904.03295","created_at":"2026-05-17T23:49:13Z"},{"alias_kind":"pith_short_12","alias_value":"HA6XVOI5HK6H","created_at":"2026-05-18T12:33:18Z"},{"alias_kind":"pith_short_16","alias_value":"HA6XVOI5HK6HFAMH","created_at":"2026-05-18T12:33:18Z"},{"alias_kind":"pith_short_8","alias_value":"HA6XVOI5","created_at":"2026-05-18T12:33:18Z"}],"graph_snapshots":[{"event_id":"sha256:4c8f4bb46a530256a97414d4599b471fc5f36a9f3e5776d0f4b2eeb0b8809193","target":"graph","created_at":"2026-05-17T23:49:13Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"Policy gradient algorithms typically combine discounted future rewards with an estimated value function, to compute the direction and magnitude of parameter updates. However, for most Reinforcement Learning tasks, humans can provide additional insight to constrain the policy learning. We introduce a general method to incorporate multiple different feedback channels into a single policy gradient loss. In our formulation, the Multi-Preference Actor Critic (M-PAC), these different types of feedback are implemented as constraints on the policy. We use a Lagrangian relaxation to satisfy these const","authors_text":"Adith Swaminathan, Ishan Durugkar, Matthew Hausknecht, Patrick MacAlpine","cross_cats":["cs.AI","stat.ML"],"headline":"","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2019-04-05T21:50:50Z","title":"Multi-Preference Actor Critic"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1904.03295","kind":"arxiv","version":1},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:afa879f8d9468fb3213dac3a0f2a69e2206f49d2c888df7d797e97cab643c1a8","target":"record","created_at":"2026-05-17T23:49:13Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"4a91fe688b6137e4ccdbe2c30ae2ff4455aac11247daea4a57bf0984e7bfb2ed","cross_cats_sorted":["cs.AI","stat.ML"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2019-04-05T21:50:50Z","title_canon_sha256":"2d81f14235b8a47cd4d3679c11585405321203cc8c243d68b9b5de7336f77dc2"},"schema_version":"1.0","source":{"id":"1904.03295","kind":"arxiv","version":1}},"canonical_sha256":"383d7ab91d3abc728187f6c314293d88bf4d345d7b04e7f4ee49cb157664b0fc","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"383d7ab91d3abc728187f6c314293d88bf4d345d7b04e7f4ee49cb157664b0fc","first_computed_at":"2026-05-17T23:49:13.935669Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:49:13.935669Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"EdN5aCMFAA9c3xLnFV98u66KU8MkUwQFazAIpgZifzY3GlGXl5GIDbWAq0CAABSVUxQI2TAOC8nP94WFgDBNBQ==","signature_status":"signed_v1","signed_at":"2026-05-17T23:49:13.936377Z","signed_message":"canonical_sha256_bytes"},"source_id":"1904.03295","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:afa879f8d9468fb3213dac3a0f2a69e2206f49d2c888df7d797e97cab643c1a8","sha256:4c8f4bb46a530256a97414d4599b471fc5f36a9f3e5776d0f4b2eeb0b8809193"],"state_sha256":"82878df8deac6bebfde40b1e82cabc4ba86fb4d25fa80e755c89b4faa9b562a1"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"g/AA0yQhRhcURMzODRvElSG0NVQwIFb481NU4UmxpMFgBch6BVcztY0f8Fxh2DUbeGxL0I9r/QpTnUyUL+/SDg==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-25T22:26:51.105861Z","bundle_sha256":"cf78afdcc411fa80389959d4923d7671c04f20ee637ce15cb1af92fe3f5a8aae"}}