{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:AZMN26M3X7NL7QVCMAG24I2YIV","short_pith_number":"pith:AZMN26M3","schema_version":"1.0","canonical_sha256":"0658dd799bbfdabfc2a2600dae23584579ad8fd41608b82babd04179a54a3c8c","source":{"kind":"arxiv","id":"2602.12691","version":3},"attestation_state":"computed","paper":{"title":"ALOE: Action-Level Off-Policy Evaluation for Vision-Language-Action Model Post-Training","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.RO","authors_text":"Chiming Liu, Chuheng Zhang, Hecheng Wang, Lizhe Qi, Maoqing Yao, Rushuai Yang, Shuoyu Yue, Wei Shan, Xiaohan Yan, Xuan Du, Yi Chen, Yongcheng Liu, Yunlong Wang, Zhichao Wu","submitted_at":"2026-02-13T07:46:37Z","abstract_excerpt":"We study how to improve large foundation vision-language-action (VLA) systems through human-in-the-loop reinforcement learning (RL) in real-world environments. A key challenge is learning reliable value functions from heterogeneous real-world experience, as value estimation provides the primary learning signal for VLA training. In practice, replay buffers contain trajectories collected from historical policies, online rollouts, demonstrations, and intermittent human interventions. Because replay buffers mix trajectories generated by different behaviors, the observed returns can be mismatched w"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2602.12691","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.RO","submitted_at":"2026-02-13T07:46:37Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"3a3cff1768e6b6c0347b60cf5a69cea7de41644cc8bfd6233d9eecac8b894b98","abstract_canon_sha256":"b8d3168011442bffc1e0d86829bdb49f955f4e4df2702b688daa8b8677d11d89"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-23T02:12:46.943590Z","signature_b64":"sRFhY4GCA0U61VndAuuSscBjs/X/yT+G5iXVUU9pujxyupedGM/lU1vPcwb4BfZYzMYZen5NHKNq2sF2QReZBg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"0658dd799bbfdabfc2a2600dae23584579ad8fd41608b82babd04179a54a3c8c","last_reissued_at":"2026-06-23T02:12:46.943084Z","signature_status":"signed_v1","first_computed_at":"2026-06-23T02:12:46.943084Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"ALOE: Action-Level Off-Policy Evaluation for Vision-Language-Action Model Post-Training","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.RO","authors_text":"Chiming Liu, Chuheng Zhang, Hecheng Wang, Lizhe Qi, Maoqing Yao, Rushuai Yang, Shuoyu Yue, Wei Shan, Xiaohan Yan, Xuan Du, Yi Chen, Yongcheng Liu, Yunlong Wang, Zhichao Wu","submitted_at":"2026-02-13T07:46:37Z","abstract_excerpt":"We study how to improve large foundation vision-language-action (VLA) systems through human-in-the-loop reinforcement learning (RL) in real-world environments. A key challenge is learning reliable value functions from heterogeneous real-world experience, as value estimation provides the primary learning signal for VLA training. In practice, replay buffers contain trajectories collected from historical policies, online rollouts, demonstrations, and intermittent human interventions. Because replay buffers mix trajectories generated by different behaviors, the observed returns can be mismatched w"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2602.12691","kind":"arxiv","version":3},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2602.12691/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2602.12691","created_at":"2026-06-23T02:12:46.943144+00:00"},{"alias_kind":"arxiv_version","alias_value":"2602.12691v3","created_at":"2026-06-23T02:12:46.943144+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2602.12691","created_at":"2026-06-23T02:12:46.943144+00:00"},{"alias_kind":"pith_short_12","alias_value":"AZMN26M3X7NL","created_at":"2026-06-23T02:12:46.943144+00:00"},{"alias_kind":"pith_short_16","alias_value":"AZMN26M3X7NL7QVC","created_at":"2026-06-23T02:12:46.943144+00:00"},{"alias_kind":"pith_short_8","alias_value":"AZMN26M3","created_at":"2026-06-23T02:12:46.943144+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":1,"internal_anchor_count":1,"sample":[{"citing_arxiv_id":"2605.10821","citing_title":"Unified Noise Steering for Efficient Human-Guided VLA Adaptation","ref_index":50,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/AZMN26M3X7NL7QVCMAG24I2YIV","json":"https://pith.science/pith/AZMN26M3X7NL7QVCMAG24I2YIV.json","graph_json":"https://pith.science/api/pith-number/AZMN26M3X7NL7QVCMAG24I2YIV/graph.json","events_json":"https://pith.science/api/pith-number/AZMN26M3X7NL7QVCMAG24I2YIV/events.json","paper":"https://pith.science/paper/AZMN26M3"},"agent_actions":{"view_html":"https://pith.science/pith/AZMN26M3X7NL7QVCMAG24I2YIV","download_json":"https://pith.science/pith/AZMN26M3X7NL7QVCMAG24I2YIV.json","view_paper":"https://pith.science/paper/AZMN26M3","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2602.12691&json=true","fetch_graph":"https://pith.science/api/pith-number/AZMN26M3X7NL7QVCMAG24I2YIV/graph.json","fetch_events":"https://pith.science/api/pith-number/AZMN26M3X7NL7QVCMAG24I2YIV/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/AZMN26M3X7NL7QVCMAG24I2YIV/action/timestamp_anchor","attest_storage":"https://pith.science/pith/AZMN26M3X7NL7QVCMAG24I2YIV/action/storage_attestation","attest_author":"https://pith.science/pith/AZMN26M3X7NL7QVCMAG24I2YIV/action/author_attestation","sign_citation":"https://pith.science/pith/AZMN26M3X7NL7QVCMAG24I2YIV/action/citation_signature","submit_replication":"https://pith.science/pith/AZMN26M3X7NL7QVCMAG24I2YIV/action/replication_record"}},"created_at":"2026-06-23T02:12:46.943144+00:00","updated_at":"2026-06-23T02:12:46.943144+00:00"}