{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:UFEZKARIFIPNHMH6773EP3JCTI","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"040b1b4756ef9bc2f2dbfa4df5cf000a85c5a82ca2f54a3861f81356a0aa5c4e","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-03-19T10:01:39Z","title_canon_sha256":"2454d0a0cbbcd1c6815d4aac9e68115cfe1bed1301ce8fcaec3cdfa0072f2760"},"schema_version":"1.0","source":{"id":"2603.18702","kind":"arxiv","version":4}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2603.18702","created_at":"2026-05-20T00:04:28Z"},{"alias_kind":"arxiv_version","alias_value":"2603.18702v4","created_at":"2026-05-20T00:04:28Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2603.18702","created_at":"2026-05-20T00:04:28Z"},{"alias_kind":"pith_short_12","alias_value":"UFEZKARIFIPN","created_at":"2026-05-20T00:04:28Z"},{"alias_kind":"pith_short_16","alias_value":"UFEZKARIFIPNHMH6","created_at":"2026-05-20T00:04:28Z"},{"alias_kind":"pith_short_8","alias_value":"UFEZKARI","created_at":"2026-05-20T00:04:28Z"}],"graph_snapshots":[{"event_id":"sha256:b100ec4ae137e592b2e46a9b0e4e124e41f8f08ac5adfacd332d4d80fe83c7d8","target":"graph","created_at":"2026-05-20T00:04:28Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Conventional greedy OPL approaches may fail to maximize the policy performance, and demonstrate that policies with superior performance must exist in limited supply settings."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That logged data from an unconstrained behavior policy can be used to learn a policy that correctly accounts for future users' relative valuations under limited supply without additional assumptions on the arrival process or reward distributions."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"OPLS is a new off-policy method for contextual bandits with limited supply that outperforms greedy approaches by prioritizing items with higher relative expected rewards for the current user."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Greedy off-policy learning is suboptimal when supply is limited, and superior policies exist that allocate items based on relative expected rewards across users."}],"snapshot_sha256":"1d80710130a17e6ff438cb447b2cd1fe51b7251fcf41e529ccac4c63e01766c8"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2603.18702/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"We study off-policy learning (OPL) in contextual bandits, which plays a key role in a wide range of real-world applications such as recommendation systems and online advertising. Typical OPL in contextual bandits assumes an unconstrained environment where a policy can select the same item infinitely. However, in many practical applications, including coupon allocation and e-commerce, limited supply constrains items through budget limits on distributed coupons or inventory restrictions on products. In these settings, greedily selecting the item with the highest expected reward for the current u","authors_text":"Bushun Kawagishi, Koichi Tanaka, Nobuyuki Shimizu, Ren Kishimoto, Yasuo Yamamoto, Yusuke Narita, Yuta Saito","cross_cats":[],"headline":"Greedy off-policy learning is suboptimal when supply is limited, and superior policies exist that allocate items based on relative expected rewards across users.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-03-19T10:01:39Z","title":"Off-Policy Learning with Limited Supply"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2603.18702","kind":"arxiv","version":4},"verdict":{"created_at":"2026-05-15T08:12:12.640979Z","id":"b7400444-3876-44a2-9180-f4ed35edeb60","model_set":{"reader":"grok-4.3"},"one_line_summary":"OPLS is a new off-policy method for contextual bandits with limited supply that outperforms greedy approaches by prioritizing items with higher relative expected rewards for the current user.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Greedy off-policy learning is suboptimal when supply is limited, and superior policies exist that allocate items based on relative expected rewards across users.","strongest_claim":"Conventional greedy OPL approaches may fail to maximize the policy performance, and demonstrate that policies with superior performance must exist in limited supply settings.","weakest_assumption":"That logged data from an unconstrained behavior policy can be used to learn a policy that correctly accounts for future users' relative valuations under limited supply without additional assumptions on the arrival process or reward distributions."}},"verdict_id":"b7400444-3876-44a2-9180-f4ed35edeb60"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:3da6a4973efdce4b40a21b0bf1fdee58931d5662106171a3ef12e4f06f7ed071","target":"record","created_at":"2026-05-20T00:04:28Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"040b1b4756ef9bc2f2dbfa4df5cf000a85c5a82ca2f54a3861f81356a0aa5c4e","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-03-19T10:01:39Z","title_canon_sha256":"2454d0a0cbbcd1c6815d4aac9e68115cfe1bed1301ce8fcaec3cdfa0072f2760"},"schema_version":"1.0","source":{"id":"2603.18702","kind":"arxiv","version":4}},"canonical_sha256":"a1499502282a1ed3b0fefff647ed229a1be6d1385f3f244f84809a86e5e43767","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"a1499502282a1ed3b0fefff647ed229a1be6d1385f3f244f84809a86e5e43767","first_computed_at":"2026-05-20T00:04:28.780340Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-20T00:04:28.780340Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"4H2aCmRdPhC/JDBUssXhQoNdEZGk91nQeeptYooFaL4GO253ZbmtLnMC7VHNjdfgIGGIvB5wZYgqKXZ0KBUiCw==","signature_status":"signed_v1","signed_at":"2026-05-20T00:04:28.781309Z","signed_message":"canonical_sha256_bytes"},"source_id":"2603.18702","source_kind":"arxiv","source_version":4}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:3da6a4973efdce4b40a21b0bf1fdee58931d5662106171a3ef12e4f06f7ed071","sha256:b100ec4ae137e592b2e46a9b0e4e124e41f8f08ac5adfacd332d4d80fe83c7d8"],"state_sha256":"de6a9016b6924155efaa11bc482d1e87709db69e1f6b6ce71beba9770084a9ad"}