{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2018:DE5B74PPXXJL3ZFCTRMWLPA43V","short_pith_number":"pith:DE5B74PP","canonical_record":{"source":{"id":"1807.00403","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2018-07-01T21:52:07Z","cross_cats_sorted":["cs.AI","stat.ML"],"title_canon_sha256":"4f6ce64f572297972b07a83018543ba89a194401dd8ae50613c0ae835ac88b0c","abstract_canon_sha256":"540be58be0ad6e45a7aaf81fba8bfa84407d2345c0c05a97e0211390a47425c3"},"schema_version":"1.0"},"canonical_sha256":"193a1ff1efbdd2bde4a29c5965bc1cdd6dccff3e19f1030bb496d535a96b67e5","source":{"kind":"arxiv","id":"1807.00403","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"1807.00403","created_at":"2026-05-18T00:11:29Z"},{"alias_kind":"arxiv_version","alias_value":"1807.00403v2","created_at":"2026-05-18T00:11:29Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1807.00403","created_at":"2026-05-18T00:11:29Z"},{"alias_kind":"pith_short_12","alias_value":"DE5B74PPXXJL","created_at":"2026-05-18T12:32:19Z"},{"alias_kind":"pith_short_16","alias_value":"DE5B74PPXXJL3ZFC","created_at":"2026-05-18T12:32:19Z"},{"alias_kind":"pith_short_8","alias_value":"DE5B74PP","created_at":"2026-05-18T12:32:19Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2018:DE5B74PPXXJL3ZFCTRMWLPA43V","target":"record","payload":{"canonical_record":{"source":{"id":"1807.00403","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2018-07-01T21:52:07Z","cross_cats_sorted":["cs.AI","stat.ML"],"title_canon_sha256":"4f6ce64f572297972b07a83018543ba89a194401dd8ae50613c0ae835ac88b0c","abstract_canon_sha256":"540be58be0ad6e45a7aaf81fba8bfa84407d2345c0c05a97e0211390a47425c3"},"schema_version":"1.0"},"canonical_sha256":"193a1ff1efbdd2bde4a29c5965bc1cdd6dccff3e19f1030bb496d535a96b67e5","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T00:11:29.944759Z","signature_b64":"pAFh3vv8n4B+JOuERf1+c/mPK/KfJTmqoTuthqDbnK0jhSXpaHRa9gjxLctG4Uhjb69vqubZklQ4JmcyvyRiBg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"193a1ff1efbdd2bde4a29c5965bc1cdd6dccff3e19f1030bb496d535a96b67e5","last_reissued_at":"2026-05-18T00:11:29.944190Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T00:11:29.944190Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"1807.00403","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T00:11:29Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"qbXc8eSisOotzUfrTkycNBd4Wxfk9YjhufGd9mfY+1WiXK0EDPrx4N3KgOdl7P9Fn+3Yk1bXMzVgpKWU2EJ1DQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-02T12:23:36.506699Z"},"content_sha256":"706bb4e1f9157acb674175c1f5f13b0ca8980e65104f07fe9796967b3c45e941","schema_version":"1.0","event_id":"sha256:706bb4e1f9157acb674175c1f5f13b0ca8980e65104f07fe9796967b3c45e941"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2018:DE5B74PPXXJL3ZFCTRMWLPA43V","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Towards Mixed Optimization for Reinforcement Learning with Program Synthesis","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","stat.ML"],"primary_cat":"cs.LG","authors_text":"Kumar Krishna Agrawal, Rishabh Singh, Surya Bhupatiraju","submitted_at":"2018-07-01T21:52:07Z","abstract_excerpt":"Deep reinforcement learning has led to several recent breakthroughs, though the learned policies are often based on black-box neural networks. This makes them difficult to interpret and to impose desired specification constraints during learning. We present an iterative framework, MORL, for improving the learned policies using program synthesis. Concretely, we propose to use synthesis techniques to obtain a symbolic representation of the learned policy, which can then be debugged manually or automatically using program repair. After the repair step, we use behavior cloning to obtain the policy"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1807.00403","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T00:11:29Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"DvGo2/1XE3Xt4KzED7AlOWdBlX7GPyx9SgHCnOkUCRSVLyP6wWNQix7X/sCOwBGZSgNVIHMaF1RsL22QC1TTDQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-02T12:23:36.507049Z"},"content_sha256":"075e13a9e9d85b0f1d4da0d6e589d67b9bb7376a2e8a6b20e890fd474ff3166d","schema_version":"1.0","event_id":"sha256:075e13a9e9d85b0f1d4da0d6e589d67b9bb7376a2e8a6b20e890fd474ff3166d"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/DE5B74PPXXJL3ZFCTRMWLPA43V/bundle.json","state_url":"https://pith.science/pith/DE5B74PPXXJL3ZFCTRMWLPA43V/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/DE5B74PPXXJL3ZFCTRMWLPA43V/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-02T12:23:36Z","links":{"resolver":"https://pith.science/pith/DE5B74PPXXJL3ZFCTRMWLPA43V","bundle":"https://pith.science/pith/DE5B74PPXXJL3ZFCTRMWLPA43V/bundle.json","state":"https://pith.science/pith/DE5B74PPXXJL3ZFCTRMWLPA43V/state.json","well_known_bundle":"https://pith.science/.well-known/pith/DE5B74PPXXJL3ZFCTRMWLPA43V/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2018:DE5B74PPXXJL3ZFCTRMWLPA43V","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"540be58be0ad6e45a7aaf81fba8bfa84407d2345c0c05a97e0211390a47425c3","cross_cats_sorted":["cs.AI","stat.ML"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2018-07-01T21:52:07Z","title_canon_sha256":"4f6ce64f572297972b07a83018543ba89a194401dd8ae50613c0ae835ac88b0c"},"schema_version":"1.0","source":{"id":"1807.00403","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"1807.00403","created_at":"2026-05-18T00:11:29Z"},{"alias_kind":"arxiv_version","alias_value":"1807.00403v2","created_at":"2026-05-18T00:11:29Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1807.00403","created_at":"2026-05-18T00:11:29Z"},{"alias_kind":"pith_short_12","alias_value":"DE5B74PPXXJL","created_at":"2026-05-18T12:32:19Z"},{"alias_kind":"pith_short_16","alias_value":"DE5B74PPXXJL3ZFC","created_at":"2026-05-18T12:32:19Z"},{"alias_kind":"pith_short_8","alias_value":"DE5B74PP","created_at":"2026-05-18T12:32:19Z"}],"graph_snapshots":[{"event_id":"sha256:075e13a9e9d85b0f1d4da0d6e589d67b9bb7376a2e8a6b20e890fd474ff3166d","target":"graph","created_at":"2026-05-18T00:11:29Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"Deep reinforcement learning has led to several recent breakthroughs, though the learned policies are often based on black-box neural networks. This makes them difficult to interpret and to impose desired specification constraints during learning. We present an iterative framework, MORL, for improving the learned policies using program synthesis. Concretely, we propose to use synthesis techniques to obtain a symbolic representation of the learned policy, which can then be debugged manually or automatically using program repair. After the repair step, we use behavior cloning to obtain the policy","authors_text":"Kumar Krishna Agrawal, Rishabh Singh, Surya Bhupatiraju","cross_cats":["cs.AI","stat.ML"],"headline":"","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2018-07-01T21:52:07Z","title":"Towards Mixed Optimization for Reinforcement Learning with Program Synthesis"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1807.00403","kind":"arxiv","version":2},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:706bb4e1f9157acb674175c1f5f13b0ca8980e65104f07fe9796967b3c45e941","target":"record","created_at":"2026-05-18T00:11:29Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"540be58be0ad6e45a7aaf81fba8bfa84407d2345c0c05a97e0211390a47425c3","cross_cats_sorted":["cs.AI","stat.ML"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2018-07-01T21:52:07Z","title_canon_sha256":"4f6ce64f572297972b07a83018543ba89a194401dd8ae50613c0ae835ac88b0c"},"schema_version":"1.0","source":{"id":"1807.00403","kind":"arxiv","version":2}},"canonical_sha256":"193a1ff1efbdd2bde4a29c5965bc1cdd6dccff3e19f1030bb496d535a96b67e5","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"193a1ff1efbdd2bde4a29c5965bc1cdd6dccff3e19f1030bb496d535a96b67e5","first_computed_at":"2026-05-18T00:11:29.944190Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-18T00:11:29.944190Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"pAFh3vv8n4B+JOuERf1+c/mPK/KfJTmqoTuthqDbnK0jhSXpaHRa9gjxLctG4Uhjb69vqubZklQ4JmcyvyRiBg==","signature_status":"signed_v1","signed_at":"2026-05-18T00:11:29.944759Z","signed_message":"canonical_sha256_bytes"},"source_id":"1807.00403","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:706bb4e1f9157acb674175c1f5f13b0ca8980e65104f07fe9796967b3c45e941","sha256:075e13a9e9d85b0f1d4da0d6e589d67b9bb7376a2e8a6b20e890fd474ff3166d"],"state_sha256":"d92679a170bace72bdab97efb17b6f29e7bc53e8b71e87061594f9094c0ee979"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"V6Zs22ScgVY0Udfc8y8i6kzro+jZEP3iQny/WNtRbr8brEU7X5p5TMpRu4W+i+SUuVrXIfewEpvf9u1Xtn62DQ==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-02T12:23:36.508908Z","bundle_sha256":"e74046264c7bd23d8278af7c01882745b5457adee490f93b254db17cb6f39b44"}}