{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2022:GTXFFVUW5IAGGYDDGCDKIZJSH5","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"2e12f3f0f536b483f3c663217324ecee07bd9809f73caf8399664ec0c02148d6","cross_cats_sorted":["cs.AI","cs.CV","cs.LG"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.RO","submitted_at":"2022-03-23T17:55:09Z","title_canon_sha256":"923da15624e8a8f8846da5becb700e1523a5a0058db29deff7fbab2dd3adcab3"},"schema_version":"1.0","source":{"id":"2203.12601","kind":"arxiv","version":3}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2203.12601","created_at":"2026-05-17T23:38:52Z"},{"alias_kind":"arxiv_version","alias_value":"2203.12601v3","created_at":"2026-05-17T23:38:52Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2203.12601","created_at":"2026-05-17T23:38:52Z"},{"alias_kind":"pith_short_12","alias_value":"GTXFFVUW5IAG","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_16","alias_value":"GTXFFVUW5IAGGYDD","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_8","alias_value":"GTXFFVUW","created_at":"2026-05-18T12:33:33Z"}],"graph_snapshots":[{"event_id":"sha256:f3201a437f21b1d80e98e1b4a396cb15405f33f635e13e7ddf4b38fde1216b73","target":"graph","created_at":"2026-05-17T23:38:52Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Across a suite of 12 simulated robot manipulation tasks, we find that R3M improves task success by over 20% compared to training from scratch and by over 10% compared to state-of-the-art visual representations like CLIP and MoCo. Furthermore, R3M enables a Franka Emika Panda arm to learn a range of manipulation tasks in a real, cluttered apartment given just 20 demonstrations."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That visual features learned from human video data will transfer effectively to robotic camera inputs and task distributions without any robot-specific fine-tuning or domain adaptation."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"A visual encoder pre-trained on diverse human videos with contrastive and language objectives improves simulated robot manipulation success by over 20% versus training from scratch and enables real Franka arm tasks from 20 demonstrations."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Pre-trained visual features from human videos enable more data-efficient robot manipulation."}],"snapshot_sha256":"e69e6b87e1dfd8cc4373ea87e5817d96964cd6194efccc7ee792bde0193223b1"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"eac073ec20631c456b36b01f0bccbdae9fe60d5fe64d39c5f6a3f861ed095222"},"paper":{"abstract_excerpt":"We study how visual representations pre-trained on diverse human video data can enable data-efficient learning of downstream robotic manipulation tasks. Concretely, we pre-train a visual representation using the Ego4D human video dataset using a combination of time-contrastive learning, video-language alignment, and an L1 penalty to encourage sparse and compact representations. The resulting representation, R3M, can be used as a frozen perception module for downstream policy learning. Across a suite of 12 simulated robot manipulation tasks, we find that R3M improves task success by over 20% co","authors_text":"Abhinav Gupta, Aravind Rajeswaran, Chelsea Finn, Suraj Nair, Vikash Kumar","cross_cats":["cs.AI","cs.CV","cs.LG"],"headline":"Pre-trained visual features from human videos enable more data-efficient robot manipulation.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.RO","submitted_at":"2022-03-23T17:55:09Z","title":"R3M: A Universal Visual Representation for Robot Manipulation"},"references":{"count":71,"internal_anchors":8,"resolved_work":71,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"S. Levine, C. Finn, T. Darrell, and P. Abbeel. End-to-end training of deep visuomotor policies. The Journal of Machine Learning Research, 17(1):1334–1373, 2016","work_id":"25f290d1-bc32-4fac-8425-cbd617acd5d7","year":2016},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"J. Deng, W. Dong, R. Socher, L.-J. Li, K. Li, and L. Fei-Fei. ImageNet: A Large-Scale Hierarchical Image Database. In CVPR09, 2009","work_id":"cbccb039-8471-40e8-8444-986485f10316","year":2009},{"cited_arxiv_id":"","doi":"10.1038/s41598-020-76670-6","is_internal_anchor":false,"ref_index":3,"title":"D. Mzurikwao, M. Khan, O. Samuel, J. Cinatl, M. Wass, M. Michaelis, G. Marcelli, and C. S. Ang. Towards image-based cancer cell lines authentication using deep neural networks. Scientiﬁc Reports, 10, ","work_id":"7304f89e-02c3-4b7a-9453-82d7a0e149bf","year":2020},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"J. Devlin, M.-W. Chang, K. Lee, and K. Toutanova. BERT: Pre-training of deep bidirectional transformers for language understanding. In Conference of the North American Chapter of the Association for C","work_id":"dd528a0c-a50f-417f-a715-b88d7b97c800","year":2019},{"cited_arxiv_id":"","doi":"10.18653/v1/","is_internal_anchor":false,"ref_index":5,"title":"doi: 10.18653/v1/ 2024.findings-acl.586","work_id":"8d675bdd-79ca-48d6-9163-fc17ce0e8ece","year":2020}],"snapshot_sha256":"67f00b3640739d95e32292336d2ffa0a1645b6c4ad7966a3227435c05d64b82f"},"source":{"id":"2203.12601","kind":"arxiv","version":3},"verdict":{"created_at":"2026-05-15T13:21:26.105969Z","id":"40bb61ea-d550-45b9-99c3-e36d7d935b6d","model_set":{"reader":"grok-4.3"},"one_line_summary":"A visual encoder pre-trained on diverse human videos with contrastive and language objectives improves simulated robot manipulation success by over 20% versus training from scratch and enables real Franka arm tasks from 20 demonstrations.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Pre-trained visual features from human videos enable more data-efficient robot manipulation.","strongest_claim":"Across a suite of 12 simulated robot manipulation tasks, we find that R3M improves task success by over 20% compared to training from scratch and by over 10% compared to state-of-the-art visual representations like CLIP and MoCo. Furthermore, R3M enables a Franka Emika Panda arm to learn a range of manipulation tasks in a real, cluttered apartment given just 20 demonstrations.","weakest_assumption":"That visual features learned from human video data will transfer effectively to robotic camera inputs and task distributions without any robot-specific fine-tuning or domain adaptation."}},"verdict_id":"40bb61ea-d550-45b9-99c3-e36d7d935b6d"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:bb4316631ca990d760a3819cd6e61299672ac6714440b4a5efca6e5ae2248bef","target":"record","created_at":"2026-05-17T23:38:52Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"2e12f3f0f536b483f3c663217324ecee07bd9809f73caf8399664ec0c02148d6","cross_cats_sorted":["cs.AI","cs.CV","cs.LG"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.RO","submitted_at":"2022-03-23T17:55:09Z","title_canon_sha256":"923da15624e8a8f8846da5becb700e1523a5a0058db29deff7fbab2dd3adcab3"},"schema_version":"1.0","source":{"id":"2203.12601","kind":"arxiv","version":3}},"canonical_sha256":"34ee52d696ea006360633086a465323f42504e698f7a5184bb4ac4f8e17a2bd4","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"34ee52d696ea006360633086a465323f42504e698f7a5184bb4ac4f8e17a2bd4","first_computed_at":"2026-05-17T23:38:52.454622Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:52.454622Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"9kA08c3lkE/hJ8GR57TIvG1ouKibqBqXIVxYZmyKSWqjzGvZsN6BGOz0CC8eUvP5IgusygfkvnsN8/zE1paUAQ==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:52.455138Z","signed_message":"canonical_sha256_bytes"},"source_id":"2203.12601","source_kind":"arxiv","source_version":3}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:bb4316631ca990d760a3819cd6e61299672ac6714440b4a5efca6e5ae2248bef","sha256:f3201a437f21b1d80e98e1b4a396cb15405f33f635e13e7ddf4b38fde1216b73"],"state_sha256":"c81213848b938ae198ba9f45aaa8eadf749b196bbf262ff55904b35b3f41993f"}