{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2017:V5YIRNYGNFDWI2MQ7WY6LLWJLI","short_pith_number":"pith:V5YIRNYG","schema_version":"1.0","canonical_sha256":"af7088b7066947646990fdb1e5aec95a02ab08f2e4375ca166101ae7eee4e640","source":{"kind":"arxiv","id":"1710.10044","version":1},"attestation_state":"computed","paper":{"title":"Distributional Reinforcement Learning with Quantile Regression","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.LG","stat.ML"],"primary_cat":"cs.AI","authors_text":"Marc G. Bellemare, Mark Rowland, R\\'emi Munos, Will Dabney","submitted_at":"2017-10-27T09:35:26Z","abstract_excerpt":"In reinforcement learning an agent interacts with the environment by taking actions and observing the next state and reward. When sampled probabilistically, these state transitions, rewards, and actions can all induce randomness in the observed long-term return. Traditionally, reinforcement learning algorithms average over this randomness to estimate the value function. In this paper, we build on recent work advocating a distributional approach to reinforcement learning in which the distribution over returns is modeled explicitly instead of only estimating the mean. That is, we examine methods"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1710.10044","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2017-10-27T09:35:26Z","cross_cats_sorted":["cs.LG","stat.ML"],"title_canon_sha256":"bf8f5481e36eaefb0f3dc87d48f8a7a310f035fcbb71fa58785301adfa996a14","abstract_canon_sha256":"a49b1c4448e66180f3da24c4dd64dcd5113f40a8cef0a41bc849144d3c2663d3"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T00:31:54.888182Z","signature_b64":"mVV5QKc4NdQCNceCEMtwaX0R032HnyXk93FQyBvMW/8Gt/3vgMYKm/+Yr8FmxgzsV/KShRq+OxbJb0SoJhPyCw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"af7088b7066947646990fdb1e5aec95a02ab08f2e4375ca166101ae7eee4e640","last_reissued_at":"2026-05-18T00:31:54.887729Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T00:31:54.887729Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Distributional Reinforcement Learning with Quantile Regression","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.LG","stat.ML"],"primary_cat":"cs.AI","authors_text":"Marc G. Bellemare, Mark Rowland, R\\'emi Munos, Will Dabney","submitted_at":"2017-10-27T09:35:26Z","abstract_excerpt":"In reinforcement learning an agent interacts with the environment by taking actions and observing the next state and reward. When sampled probabilistically, these state transitions, rewards, and actions can all induce randomness in the observed long-term return. Traditionally, reinforcement learning algorithms average over this randomness to estimate the value function. In this paper, we build on recent work advocating a distributional approach to reinforcement learning in which the distribution over returns is modeled explicitly instead of only estimating the mean. That is, we examine methods"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1710.10044","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1710.10044","created_at":"2026-05-18T00:31:54.887800+00:00"},{"alias_kind":"arxiv_version","alias_value":"1710.10044v1","created_at":"2026-05-18T00:31:54.887800+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1710.10044","created_at":"2026-05-18T00:31:54.887800+00:00"},{"alias_kind":"pith_short_12","alias_value":"V5YIRNYGNFDW","created_at":"2026-05-18T12:31:49.984773+00:00"},{"alias_kind":"pith_short_16","alias_value":"V5YIRNYGNFDWI2MQ","created_at":"2026-05-18T12:31:49.984773+00:00"},{"alias_kind":"pith_short_8","alias_value":"V5YIRNYG","created_at":"2026-05-18T12:31:49.984773+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":5,"internal_anchor_count":4,"sample":[{"citing_arxiv_id":"1910.01708","citing_title":"Benchmarking Batch Deep Reinforcement Learning Algorithms","ref_index":4,"is_internal_anchor":true},{"citing_arxiv_id":"2512.03847","citing_title":"DVPO: Distributional Value Modeling-based Policy Optimization for LLM Post-Training","ref_index":6,"is_internal_anchor":true},{"citing_arxiv_id":"2603.04333","citing_title":"What Does Flow Matching Bring To TD Learning?","ref_index":12,"is_internal_anchor":true},{"citing_arxiv_id":"2010.02193","citing_title":"Mastering Atari with Discrete World Models","ref_index":12,"is_internal_anchor":true},{"citing_arxiv_id":"2605.12462","citing_title":"Towards Affordable Energy: A Gymnasium Environment for Electric Utility Demand-Response Programs","ref_index":9,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/V5YIRNYGNFDWI2MQ7WY6LLWJLI","json":"https://pith.science/pith/V5YIRNYGNFDWI2MQ7WY6LLWJLI.json","graph_json":"https://pith.science/api/pith-number/V5YIRNYGNFDWI2MQ7WY6LLWJLI/graph.json","events_json":"https://pith.science/api/pith-number/V5YIRNYGNFDWI2MQ7WY6LLWJLI/events.json","paper":"https://pith.science/paper/V5YIRNYG"},"agent_actions":{"view_html":"https://pith.science/pith/V5YIRNYGNFDWI2MQ7WY6LLWJLI","download_json":"https://pith.science/pith/V5YIRNYGNFDWI2MQ7WY6LLWJLI.json","view_paper":"https://pith.science/paper/V5YIRNYG","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1710.10044&json=true","fetch_graph":"https://pith.science/api/pith-number/V5YIRNYGNFDWI2MQ7WY6LLWJLI/graph.json","fetch_events":"https://pith.science/api/pith-number/V5YIRNYGNFDWI2MQ7WY6LLWJLI/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/V5YIRNYGNFDWI2MQ7WY6LLWJLI/action/timestamp_anchor","attest_storage":"https://pith.science/pith/V5YIRNYGNFDWI2MQ7WY6LLWJLI/action/storage_attestation","attest_author":"https://pith.science/pith/V5YIRNYGNFDWI2MQ7WY6LLWJLI/action/author_attestation","sign_citation":"https://pith.science/pith/V5YIRNYGNFDWI2MQ7WY6LLWJLI/action/citation_signature","submit_replication":"https://pith.science/pith/V5YIRNYGNFDWI2MQ7WY6LLWJLI/action/replication_record"}},"created_at":"2026-05-18T00:31:54.887800+00:00","updated_at":"2026-05-18T00:31:54.887800+00:00"}