{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:WKZ3EEVGZRZV4Y7DQWLPVIQQC6","short_pith_number":"pith:WKZ3EEVG","schema_version":"1.0","canonical_sha256":"b2b3b212a6cc735e63e38596faa21017b4280494454fc66be4b151c99b025036","source":{"kind":"arxiv","id":"2506.10677","version":3},"attestation_state":"computed","paper":{"title":"Exploiting Similarities in A/B Testing with Off-Policy Estimation","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.LG"],"primary_cat":"stat.ML","authors_text":"Alexandre Gilotte, David Rohde, Otmane Sakhi","submitted_at":"2025-06-12T13:11:01Z","abstract_excerpt":"We study A/B testing, the standard protocol for measuring the performance gain of a new decision system relative to a baseline. Traditional A/B testing treats both systems as black boxes, ignoring potential similarities between them. In practice, however, new and baseline systems are rarely radically different and often share significant structure, which can be captured by their propensities to make similar decisions. We show that in such cases, the commonly used difference-in-means estimator, though unbiased, is statistically suboptimal. Leveraging off-policy estimation, we introduce a family"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2506.10677","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"stat.ML","submitted_at":"2025-06-12T13:11:01Z","cross_cats_sorted":["cs.LG"],"title_canon_sha256":"34e269508f6e821312b5056ad504787160b4aa9c5decb547a9731f4eade80280","abstract_canon_sha256":"2b81443b440a250cb56fb6e319d15cdf31a861b53a43d804c1a5f2bb6e53f15b"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-02T03:04:32.925450Z","signature_b64":"xmKdVq3d9qKd+fk2aRmxpa+unksPK3vSp0QrD3oOne89Z5TOZANvCq2NF5LAnqIrLKW3n0yClw3+9QnWAkkuCA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"b2b3b212a6cc735e63e38596faa21017b4280494454fc66be4b151c99b025036","last_reissued_at":"2026-06-02T03:04:32.924891Z","signature_status":"signed_v1","first_computed_at":"2026-06-02T03:04:32.924891Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Exploiting Similarities in A/B Testing with Off-Policy Estimation","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.LG"],"primary_cat":"stat.ML","authors_text":"Alexandre Gilotte, David Rohde, Otmane Sakhi","submitted_at":"2025-06-12T13:11:01Z","abstract_excerpt":"We study A/B testing, the standard protocol for measuring the performance gain of a new decision system relative to a baseline. Traditional A/B testing treats both systems as black boxes, ignoring potential similarities between them. In practice, however, new and baseline systems are rarely radically different and often share significant structure, which can be captured by their propensities to make similar decisions. We show that in such cases, the commonly used difference-in-means estimator, though unbiased, is statistically suboptimal. Leveraging off-policy estimation, we introduce a family"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2506.10677","kind":"arxiv","version":3},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2506.10677/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2506.10677","created_at":"2026-06-02T03:04:32.924951+00:00"},{"alias_kind":"arxiv_version","alias_value":"2506.10677v3","created_at":"2026-06-02T03:04:32.924951+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2506.10677","created_at":"2026-06-02T03:04:32.924951+00:00"},{"alias_kind":"pith_short_12","alias_value":"WKZ3EEVGZRZV","created_at":"2026-06-02T03:04:32.924951+00:00"},{"alias_kind":"pith_short_16","alias_value":"WKZ3EEVGZRZV4Y7D","created_at":"2026-06-02T03:04:32.924951+00:00"},{"alias_kind":"pith_short_8","alias_value":"WKZ3EEVG","created_at":"2026-06-02T03:04:32.924951+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":2,"internal_anchor_count":2,"sample":[{"citing_arxiv_id":"2605.15108","citing_title":"Logging Policy Design for Off-Policy Evaluation","ref_index":50,"is_internal_anchor":true},{"citing_arxiv_id":"2605.15108","citing_title":"Logging Policy Design for Off-Policy Evaluation","ref_index":50,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/WKZ3EEVGZRZV4Y7DQWLPVIQQC6","json":"https://pith.science/pith/WKZ3EEVGZRZV4Y7DQWLPVIQQC6.json","graph_json":"https://pith.science/api/pith-number/WKZ3EEVGZRZV4Y7DQWLPVIQQC6/graph.json","events_json":"https://pith.science/api/pith-number/WKZ3EEVGZRZV4Y7DQWLPVIQQC6/events.json","paper":"https://pith.science/paper/WKZ3EEVG"},"agent_actions":{"view_html":"https://pith.science/pith/WKZ3EEVGZRZV4Y7DQWLPVIQQC6","download_json":"https://pith.science/pith/WKZ3EEVGZRZV4Y7DQWLPVIQQC6.json","view_paper":"https://pith.science/paper/WKZ3EEVG","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2506.10677&json=true","fetch_graph":"https://pith.science/api/pith-number/WKZ3EEVGZRZV4Y7DQWLPVIQQC6/graph.json","fetch_events":"https://pith.science/api/pith-number/WKZ3EEVGZRZV4Y7DQWLPVIQQC6/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/WKZ3EEVGZRZV4Y7DQWLPVIQQC6/action/timestamp_anchor","attest_storage":"https://pith.science/pith/WKZ3EEVGZRZV4Y7DQWLPVIQQC6/action/storage_attestation","attest_author":"https://pith.science/pith/WKZ3EEVGZRZV4Y7DQWLPVIQQC6/action/author_attestation","sign_citation":"https://pith.science/pith/WKZ3EEVGZRZV4Y7DQWLPVIQQC6/action/citation_signature","submit_replication":"https://pith.science/pith/WKZ3EEVGZRZV4Y7DQWLPVIQQC6/action/replication_record"}},"created_at":"2026-06-02T03:04:32.924951+00:00","updated_at":"2026-06-02T03:04:32.924951+00:00"}