{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2023:RIN4LFODOD6LAI57IADUKMTMY2","short_pith_number":"pith:RIN4LFOD","canonical_record":{"source":{"id":"2303.17491","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2023-03-30T16:01:52Z","cross_cats_sorted":["cs.AI","cs.HC","cs.LG"],"title_canon_sha256":"3052200996bccbdd3aea210ba8a81dc343126e1d0a133066f47e02fa852b0ed5","abstract_canon_sha256":"69132e31a7eb5356f32a2e7c61b3915b3437ad300418ab3fa4792d45de0579dc"},"schema_version":"1.0"},"canonical_sha256":"8a1bc595c370fcb023bf400745326cc699c5b84031d15b20656b81b207f550dc","source":{"kind":"arxiv","id":"2303.17491","version":3},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2303.17491","created_at":"2026-05-17T23:38:14Z"},{"alias_kind":"arxiv_version","alias_value":"2303.17491v3","created_at":"2026-05-17T23:38:14Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2303.17491","created_at":"2026-05-17T23:38:14Z"},{"alias_kind":"pith_short_12","alias_value":"RIN4LFODOD6L","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"RIN4LFODOD6LAI57","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"RIN4LFOD","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2023:RIN4LFODOD6LAI57IADUKMTMY2","target":"record","payload":{"canonical_record":{"source":{"id":"2303.17491","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2023-03-30T16:01:52Z","cross_cats_sorted":["cs.AI","cs.HC","cs.LG"],"title_canon_sha256":"3052200996bccbdd3aea210ba8a81dc343126e1d0a133066f47e02fa852b0ed5","abstract_canon_sha256":"69132e31a7eb5356f32a2e7c61b3915b3437ad300418ab3fa4792d45de0579dc"},"schema_version":"1.0"},"canonical_sha256":"8a1bc595c370fcb023bf400745326cc699c5b84031d15b20656b81b207f550dc","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:14.140197Z","signature_b64":"oBJnyI5EqkyBsyydTAcnShFKcWxiIDwsxofWKcO+gMoRmYxxM0NC+8mPRwiUQ5RlO9kh7DjWYSx+0ey66/8FCQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"8a1bc595c370fcb023bf400745326cc699c5b84031d15b20656b81b207f550dc","last_reissued_at":"2026-05-17T23:38:14.139474Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:14.139474Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2303.17491","source_version":3,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:14Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"r9t+f1wkCaQNncT0vs+S8ngRLdiEpuCWV/n+zw1b8EmjVB1vXc2VU6TkPoHXuPoLqOuhMMnqREkRtIr4zOOiAA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-18T19:45:35.364119Z"},"content_sha256":"8a80278db29a897e694428794319fa47defce99e4474d700f573048fd06d858c","schema_version":"1.0","event_id":"sha256:8a80278db29a897e694428794319fa47defce99e4474d700f573048fd06d858c"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2023:RIN4LFODOD6LAI57IADUKMTMY2","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Language Models can Solve Computer Tasks","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"Pre-trained language models solve novel computer tasks by recursively criticizing and improving their own outputs.","cross_cats":["cs.AI","cs.HC","cs.LG"],"primary_cat":"cs.CL","authors_text":"Geunwoo Kim, Pierre Baldi, Stephen McAleer","submitted_at":"2023-03-30T16:01:52Z","abstract_excerpt":"Agents capable of carrying out general tasks on a computer can improve efficiency and productivity by automating repetitive tasks and assisting in complex problem-solving. Ideally, such agents should be able to solve new computer tasks presented to them through natural language commands. However, previous approaches to this problem require large amounts of expert demonstrations and task-specific reward functions, both of which are impractical for new tasks. In this work, we show that a pre-trained large language model (LLM) agent can execute computer tasks guided by natural language using a si"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"RCI with the InstructGPT-3+RLHF LLM is state-of-the-art on MiniWoB++, using only a handful of demonstrations per task rather than tens of thousands, and without a task-specific reward function.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the pre-trained LLM already contains sufficient world knowledge and self-critique capability to generate and iteratively refine correct computer actions for novel tasks when given only a few demonstrations and a simple prompting template.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Pre-trained LLMs using recursive criticism and improvement prompting achieve state-of-the-art results on the MiniWoB++ computer task benchmark with only a handful of demonstrations and no task-specific reward function.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Pre-trained language models solve novel computer tasks by recursively criticizing and improving their own outputs.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"9b0556daa55dfa0a6e65c238492af6a8c225eb30764b8d14b8fed3fd1b0521be"},"source":{"id":"2303.17491","kind":"arxiv","version":3},"verdict":{"id":"1241c1f4-e686-44a2-ac3b-ba39bb44abba","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-17T12:11:24.124658Z","strongest_claim":"RCI with the InstructGPT-3+RLHF LLM is state-of-the-art on MiniWoB++, using only a handful of demonstrations per task rather than tens of thousands, and without a task-specific reward function.","one_line_summary":"Pre-trained LLMs using recursive criticism and improvement prompting achieve state-of-the-art results on the MiniWoB++ computer task benchmark with only a handful of demonstrations and no task-specific reward function.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the pre-trained LLM already contains sufficient world knowledge and self-critique capability to generate and iteratively refine correct computer actions for novel tasks when given only a few demonstrations and a simple prompting template.","pith_extraction_headline":"Pre-trained language models solve novel computer tasks by recursively criticizing and improving their own outputs."},"references":{"count":102,"sample":[{"doi":"","year":2022,"title":"Do As I Can, Not As I Say: Grounding Language in Robotic Affordances","work_id":"037320f1-b0a9-4cbe-a639-bfb25409ce71","ref_index":1,"cited_arxiv_id":"2204.01691","is_internal_anchor":true},{"doi":"","year":2022,"title":"Flamingo: a visual language model for few-shot learning","work_id":"906c9ca4-12b9-4ec9-ba04-d0bad471451d","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2022,"title":"Constitutional AI: Harmlessness from AI Feedback","work_id":"faaaa4e0-2676-4fac-a0b4-99aef10d2095","ref_index":3,"cited_arxiv_id":"2212.08073","is_internal_anchor":true},{"doi":"","year":2022,"title":"Video pretraining (vpt): Learning to act by watching unlabeled online videos","work_id":"8791fba2-3b4c-4358-bba5-ae454e771d6d","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":1901,"title":"Language models are few-shot learners","work_id":"06921215-168b-4266-a8bd-53d84ad473f0","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":102,"snapshot_sha256":"dba6a23200abf7752bbe4c2a1b607dfec157ee1c6c389dfc2fa1ee3e3df8363e","internal_anchors":19},"formal_canon":{"evidence_count":1,"snapshot_sha256":"08f158b359c71aeb903bd6d4069871459bc25234c403aab33639b5381de32531"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"1241c1f4-e686-44a2-ac3b-ba39bb44abba"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:14Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"HsvT0RDoXJ7kVR7Ol0WXxxkpZ3wJ7yUvSTJFVZBpA5mg9Ph6slRxxcXTvDlcv5j7Ob4PxCQ2q3Tpfy2+cw/ABg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-18T19:45:35.365114Z"},"content_sha256":"4440dbf6b90d8a5348b73d9babf448622d23c2d6c94a600fd01331af1d5e3a7f","schema_version":"1.0","event_id":"sha256:4440dbf6b90d8a5348b73d9babf448622d23c2d6c94a600fd01331af1d5e3a7f"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/RIN4LFODOD6LAI57IADUKMTMY2/bundle.json","state_url":"https://pith.science/pith/RIN4LFODOD6LAI57IADUKMTMY2/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/RIN4LFODOD6LAI57IADUKMTMY2/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-18T19:45:35Z","links":{"resolver":"https://pith.science/pith/RIN4LFODOD6LAI57IADUKMTMY2","bundle":"https://pith.science/pith/RIN4LFODOD6LAI57IADUKMTMY2/bundle.json","state":"https://pith.science/pith/RIN4LFODOD6LAI57IADUKMTMY2/state.json","well_known_bundle":"https://pith.science/.well-known/pith/RIN4LFODOD6LAI57IADUKMTMY2/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2023:RIN4LFODOD6LAI57IADUKMTMY2","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"69132e31a7eb5356f32a2e7c61b3915b3437ad300418ab3fa4792d45de0579dc","cross_cats_sorted":["cs.AI","cs.HC","cs.LG"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2023-03-30T16:01:52Z","title_canon_sha256":"3052200996bccbdd3aea210ba8a81dc343126e1d0a133066f47e02fa852b0ed5"},"schema_version":"1.0","source":{"id":"2303.17491","kind":"arxiv","version":3}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2303.17491","created_at":"2026-05-17T23:38:14Z"},{"alias_kind":"arxiv_version","alias_value":"2303.17491v3","created_at":"2026-05-17T23:38:14Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2303.17491","created_at":"2026-05-17T23:38:14Z"},{"alias_kind":"pith_short_12","alias_value":"RIN4LFODOD6L","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"RIN4LFODOD6LAI57","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"RIN4LFOD","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:4440dbf6b90d8a5348b73d9babf448622d23c2d6c94a600fd01331af1d5e3a7f","target":"graph","created_at":"2026-05-17T23:38:14Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"RCI with the InstructGPT-3+RLHF LLM is state-of-the-art on MiniWoB++, using only a handful of demonstrations per task rather than tens of thousands, and without a task-specific reward function."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the pre-trained LLM already contains sufficient world knowledge and self-critique capability to generate and iteratively refine correct computer actions for novel tasks when given only a few demonstrations and a simple prompting template."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Pre-trained LLMs using recursive criticism and improvement prompting achieve state-of-the-art results on the MiniWoB++ computer task benchmark with only a handful of demonstrations and no task-specific reward function."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Pre-trained language models solve novel computer tasks by recursively criticizing and improving their own outputs."}],"snapshot_sha256":"9b0556daa55dfa0a6e65c238492af6a8c225eb30764b8d14b8fed3fd1b0521be"},"formal_canon":{"evidence_count":1,"snapshot_sha256":"08f158b359c71aeb903bd6d4069871459bc25234c403aab33639b5381de32531"},"paper":{"abstract_excerpt":"Agents capable of carrying out general tasks on a computer can improve efficiency and productivity by automating repetitive tasks and assisting in complex problem-solving. Ideally, such agents should be able to solve new computer tasks presented to them through natural language commands. However, previous approaches to this problem require large amounts of expert demonstrations and task-specific reward functions, both of which are impractical for new tasks. In this work, we show that a pre-trained large language model (LLM) agent can execute computer tasks guided by natural language using a si","authors_text":"Geunwoo Kim, Pierre Baldi, Stephen McAleer","cross_cats":["cs.AI","cs.HC","cs.LG"],"headline":"Pre-trained language models solve novel computer tasks by recursively criticizing and improving their own outputs.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2023-03-30T16:01:52Z","title":"Language Models can Solve Computer Tasks"},"references":{"count":102,"internal_anchors":19,"resolved_work":102,"sample":[{"cited_arxiv_id":"2204.01691","doi":"","is_internal_anchor":true,"ref_index":1,"title":"Do As I Can, Not As I Say: Grounding Language in Robotic Affordances","work_id":"037320f1-b0a9-4cbe-a639-bfb25409ce71","year":2022},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Flamingo: a visual language model for few-shot learning","work_id":"906c9ca4-12b9-4ec9-ba04-d0bad471451d","year":2022},{"cited_arxiv_id":"2212.08073","doi":"","is_internal_anchor":true,"ref_index":3,"title":"Constitutional AI: Harmlessness from AI Feedback","work_id":"faaaa4e0-2676-4fac-a0b4-99aef10d2095","year":2022},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Video pretraining (vpt): Learning to act by watching unlabeled online videos","work_id":"8791fba2-3b4c-4358-bba5-ae454e771d6d","year":2022},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Language models are few-shot learners","work_id":"06921215-168b-4266-a8bd-53d84ad473f0","year":1901}],"snapshot_sha256":"dba6a23200abf7752bbe4c2a1b607dfec157ee1c6c389dfc2fa1ee3e3df8363e"},"source":{"id":"2303.17491","kind":"arxiv","version":3},"verdict":{"created_at":"2026-05-17T12:11:24.124658Z","id":"1241c1f4-e686-44a2-ac3b-ba39bb44abba","model_set":{"reader":"grok-4.3"},"one_line_summary":"Pre-trained LLMs using recursive criticism and improvement prompting achieve state-of-the-art results on the MiniWoB++ computer task benchmark with only a handful of demonstrations and no task-specific reward function.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Pre-trained language models solve novel computer tasks by recursively criticizing and improving their own outputs.","strongest_claim":"RCI with the InstructGPT-3+RLHF LLM is state-of-the-art on MiniWoB++, using only a handful of demonstrations per task rather than tens of thousands, and without a task-specific reward function.","weakest_assumption":"That the pre-trained LLM already contains sufficient world knowledge and self-critique capability to generate and iteratively refine correct computer actions for novel tasks when given only a few demonstrations and a simple prompting template."}},"verdict_id":"1241c1f4-e686-44a2-ac3b-ba39bb44abba"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:8a80278db29a897e694428794319fa47defce99e4474d700f573048fd06d858c","target":"record","created_at":"2026-05-17T23:38:14Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"69132e31a7eb5356f32a2e7c61b3915b3437ad300418ab3fa4792d45de0579dc","cross_cats_sorted":["cs.AI","cs.HC","cs.LG"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2023-03-30T16:01:52Z","title_canon_sha256":"3052200996bccbdd3aea210ba8a81dc343126e1d0a133066f47e02fa852b0ed5"},"schema_version":"1.0","source":{"id":"2303.17491","kind":"arxiv","version":3}},"canonical_sha256":"8a1bc595c370fcb023bf400745326cc699c5b84031d15b20656b81b207f550dc","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"8a1bc595c370fcb023bf400745326cc699c5b84031d15b20656b81b207f550dc","first_computed_at":"2026-05-17T23:38:14.139474Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:14.139474Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"oBJnyI5EqkyBsyydTAcnShFKcWxiIDwsxofWKcO+gMoRmYxxM0NC+8mPRwiUQ5RlO9kh7DjWYSx+0ey66/8FCQ==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:14.140197Z","signed_message":"canonical_sha256_bytes"},"source_id":"2303.17491","source_kind":"arxiv","source_version":3}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:8a80278db29a897e694428794319fa47defce99e4474d700f573048fd06d858c","sha256:4440dbf6b90d8a5348b73d9babf448622d23c2d6c94a600fd01331af1d5e3a7f"],"state_sha256":"09ce82bcb73de20b43d0574c643422d00a27f27c31c42af873530a85be4c77c1"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"Q+6oMgB/CMQoXWFqGdOvU8reGaRsKbj406rC8HvB6JE0Ebh6fyo/5VQozYSQP+rGjhR7tXZ9hK5fzIKsyKaYCA==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-18T19:45:35.368011Z","bundle_sha256":"f6ada896f82eb01b1f5733c86fcec194d9253fbddbeee0d533333d2cfe5a6563"}}