{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2018:4AFE7FBJV5UBPYKKVLIWRRJQNP","short_pith_number":"pith:4AFE7FBJ","schema_version":"1.0","canonical_sha256":"e00a4f9429af6817e14aaad168c5306bd37320723e3a67a534845eb8f0814062","source":{"kind":"arxiv","id":"1801.08099","version":8},"attestation_state":"computed","paper":{"title":"Logically-Constrained Reinforcement Learning","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.LO"],"primary_cat":"cs.LG","authors_text":"Alessandro Abate, Daniel Kroening, Mohammadhosein Hasanbeig","submitted_at":"2018-01-24T17:50:30Z","abstract_excerpt":"We present the first model-free Reinforcement Learning (RL) algorithm to synthesise policies for an unknown Markov Decision Process (MDP), such that a linear time property is satisfied. The given temporal property is converted into a Limit Deterministic Buchi Automaton (LDBA) and a robust reward function is defined over the state-action pairs of the MDP according to the resulting LDBA. With this reward function, the policy synthesis procedure is \"constrained\" by the given specification. These constraints guide the MDP exploration so as to minimize the solution time by only considering the port"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1801.08099","kind":"arxiv","version":8},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2018-01-24T17:50:30Z","cross_cats_sorted":["cs.LO"],"title_canon_sha256":"17a79c517fd8f4f4c74343016894200e6a4b96fc905801c466e3ca490f8ec770","abstract_canon_sha256":"15672051266b40fb5fba9ac7c2ef44c8ce5cc885c15674307fb95e95a02f13c6"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:53:51.758126Z","signature_b64":"Jvyh4+LwM/Vv2lYl1gOpOsZWeFgu0jAE3PxU050u78jjYBbRyLOn7lYY0bkhXlHjJ8DAO6cUClGoynDf+vjWCw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"e00a4f9429af6817e14aaad168c5306bd37320723e3a67a534845eb8f0814062","last_reissued_at":"2026-05-17T23:53:51.757497Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:53:51.757497Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Logically-Constrained Reinforcement Learning","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.LO"],"primary_cat":"cs.LG","authors_text":"Alessandro Abate, Daniel Kroening, Mohammadhosein Hasanbeig","submitted_at":"2018-01-24T17:50:30Z","abstract_excerpt":"We present the first model-free Reinforcement Learning (RL) algorithm to synthesise policies for an unknown Markov Decision Process (MDP), such that a linear time property is satisfied. The given temporal property is converted into a Limit Deterministic Buchi Automaton (LDBA) and a robust reward function is defined over the state-action pairs of the MDP according to the resulting LDBA. With this reward function, the policy synthesis procedure is \"constrained\" by the given specification. These constraints guide the MDP exploration so as to minimize the solution time by only considering the port"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1801.08099","kind":"arxiv","version":8},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1801.08099","created_at":"2026-05-17T23:53:51.757615+00:00"},{"alias_kind":"arxiv_version","alias_value":"1801.08099v8","created_at":"2026-05-17T23:53:51.757615+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1801.08099","created_at":"2026-05-17T23:53:51.757615+00:00"},{"alias_kind":"pith_short_12","alias_value":"4AFE7FBJV5UB","created_at":"2026-05-18T12:32:05.422762+00:00"},{"alias_kind":"pith_short_16","alias_value":"4AFE7FBJV5UBPYKK","created_at":"2026-05-18T12:32:05.422762+00:00"},{"alias_kind":"pith_short_8","alias_value":"4AFE7FBJ","created_at":"2026-05-18T12:32:05.422762+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":3,"internal_anchor_count":2,"sample":[{"citing_arxiv_id":"1909.07299","citing_title":"Control Synthesis from Linear Temporal Logic Specifications using Model-Free Reinforcement Learning","ref_index":29,"is_internal_anchor":true},{"citing_arxiv_id":"2605.16198","citing_title":"Formal Methods Meet LLMs: Auditing, Monitoring, and Intervention for Compliance of Advanced AI Systems","ref_index":48,"is_internal_anchor":true},{"citing_arxiv_id":"2604.24729","citing_title":"SpecRLBench: A Benchmark for Generalization in Specification-Guided Reinforcement Learning","ref_index":4,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/4AFE7FBJV5UBPYKKVLIWRRJQNP","json":"https://pith.science/pith/4AFE7FBJV5UBPYKKVLIWRRJQNP.json","graph_json":"https://pith.science/api/pith-number/4AFE7FBJV5UBPYKKVLIWRRJQNP/graph.json","events_json":"https://pith.science/api/pith-number/4AFE7FBJV5UBPYKKVLIWRRJQNP/events.json","paper":"https://pith.science/paper/4AFE7FBJ"},"agent_actions":{"view_html":"https://pith.science/pith/4AFE7FBJV5UBPYKKVLIWRRJQNP","download_json":"https://pith.science/pith/4AFE7FBJV5UBPYKKVLIWRRJQNP.json","view_paper":"https://pith.science/paper/4AFE7FBJ","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1801.08099&json=true","fetch_graph":"https://pith.science/api/pith-number/4AFE7FBJV5UBPYKKVLIWRRJQNP/graph.json","fetch_events":"https://pith.science/api/pith-number/4AFE7FBJV5UBPYKKVLIWRRJQNP/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/4AFE7FBJV5UBPYKKVLIWRRJQNP/action/timestamp_anchor","attest_storage":"https://pith.science/pith/4AFE7FBJV5UBPYKKVLIWRRJQNP/action/storage_attestation","attest_author":"https://pith.science/pith/4AFE7FBJV5UBPYKKVLIWRRJQNP/action/author_attestation","sign_citation":"https://pith.science/pith/4AFE7FBJV5UBPYKKVLIWRRJQNP/action/citation_signature","submit_replication":"https://pith.science/pith/4AFE7FBJV5UBPYKKVLIWRRJQNP/action/replication_record"}},"created_at":"2026-05-17T23:53:51.757615+00:00","updated_at":"2026-05-17T23:53:51.757615+00:00"}