{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2023:SZRBHSLRNVZDJUEZMF5UBKC4W5","short_pith_number":"pith:SZRBHSLR","canonical_record":{"source":{"id":"2305.03726","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2023-05-05T17:59:46Z","cross_cats_sorted":["cs.CL"],"title_canon_sha256":"7f1ada7a3f996e919f83d304f27b98700115314307b314a7a04bb90566b62030","abstract_canon_sha256":"f66dfa86f6dda71fb473e6f79af6934e8e44dc9ecb4cee8807ff533506874aec"},"schema_version":"1.0"},"canonical_sha256":"966213c9716d7234d099617b40a85cb77984fc3acbebb3591451c6e67aa9b5b8","source":{"kind":"arxiv","id":"2305.03726","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2305.03726","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"arxiv_version","alias_value":"2305.03726v2","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2305.03726","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"pith_short_12","alias_value":"SZRBHSLRNVZD","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"SZRBHSLRNVZDJUEZ","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"SZRBHSLR","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2023:SZRBHSLRNVZDJUEZMF5UBKC4W5","target":"record","payload":{"canonical_record":{"source":{"id":"2305.03726","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2023-05-05T17:59:46Z","cross_cats_sorted":["cs.CL"],"title_canon_sha256":"7f1ada7a3f996e919f83d304f27b98700115314307b314a7a04bb90566b62030","abstract_canon_sha256":"f66dfa86f6dda71fb473e6f79af6934e8e44dc9ecb4cee8807ff533506874aec"},"schema_version":"1.0"},"canonical_sha256":"966213c9716d7234d099617b40a85cb77984fc3acbebb3591451c6e67aa9b5b8","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:53.781728Z","signature_b64":"eMjFiTgs12kZGeXvSJlbD6Nv1vtjYyOdEbx8Xrkch9QgR0CF/0qY+qRkgA4yZAlJ8o+zjEwdV2B0Y+w7gzxmDQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"966213c9716d7234d099617b40a85cb77984fc3acbebb3591451c6e67aa9b5b8","last_reissued_at":"2026-05-17T23:38:53.781237Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:53.781237Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2305.03726","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:53Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"OumPEVKoBL2E+E91mRH2SlPzaBHKc5VWKSoDNvPGhK+un7X5OLfacChMByQQe4WHGPRhtalEPLrYtoSn9EI2Bg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-25T17:43:07.302131Z"},"content_sha256":"38d84c6f0d2b3f5842fd0196e902e193b829f4608c4ef99ba128e92f896f769a","schema_version":"1.0","event_id":"sha256:38d84c6f0d2b3f5842fd0196e902e193b829f4608c4ef99ba128e92f896f769a"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2023:SZRBHSLRNVZDJUEZMF5UBKC4W5","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Otter: A Multi-Modal Model with In-Context Instruction Tuning","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Otter improves multi-modal instruction following by training on in-context examples from both text and images or videos.","cross_cats":["cs.CL"],"primary_cat":"cs.CV","authors_text":"Bo Li, Fanyi Pu, Jinghao Wang, Jingkang Yang, Joshua Adrian Cahyono, Liangyu Chen, Yuanhan Zhang, Ziwei Liu","submitted_at":"2023-05-05T17:59:46Z","abstract_excerpt":"Recent advances in Large Multimodal Models (LMMs) have unveiled great potential as visual assistants. However, most existing works focus on responding to individual instructions or using previous dialogues for contextual understanding. There is little discussion on employing both images and text as in-context examples to enhance the instruction following capability.\n  To bridge this gap, we introduce the \\textbf{Otter} model to leverage both textual and visual in-context examples for instruction tuning. Specifically, Otter builds upon Flamingo with Perceiver architecture, and has been instruct"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"instruction tuning with these in-context examples substantially enhances model convergence and generalization capabilities. Notably, the extensive scenario coverage provided by the MIMIC-IT dataset empowers the Otter model to excel in tasks involving complex video and multi-image understanding.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the MIMIC-IT dataset's curation of diverse in-context examples across images and videos produces genuine generalization gains rather than dataset-specific improvements, and that the base Flamingo Perceiver architecture seamlessly supports the added multi-modal in-context inputs without hidden limitations.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Otter is a multi-modal model instruction-tuned on the MIMIC-IT dataset of over 3 million in-context instruction-response pairs to improve convergence and generalization on tasks with multiple images and videos.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Otter improves multi-modal instruction following by training on in-context examples from both text and images or videos.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"b33bb1d522d9f2c58fe10ea2b497e82c8fde62136b7e893a1742fa13ee0fa660"},"source":{"id":"2305.03726","kind":"arxiv","version":2},"verdict":{"id":"7d51ca55-25cd-4603-b554-e9732f3645ac","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T02:40:09.150748Z","strongest_claim":"instruction tuning with these in-context examples substantially enhances model convergence and generalization capabilities. Notably, the extensive scenario coverage provided by the MIMIC-IT dataset empowers the Otter model to excel in tasks involving complex video and multi-image understanding.","one_line_summary":"Otter is a multi-modal model instruction-tuned on the MIMIC-IT dataset of over 3 million in-context instruction-response pairs to improve convergence and generalization on tasks with multiple images and videos.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the MIMIC-IT dataset's curation of diverse in-context examples across images and videos produces genuine generalization gains rather than dataset-specific improvements, and that the base Flamingo Perceiver architecture seamlessly supports the added multi-modal in-context inputs without hidden limitations.","pith_extraction_headline":"Otter improves multi-modal instruction following by training on in-context examples from both text and images or videos."},"references":{"count":104,"sample":[{"doi":"","year":2023,"title":"https://commoncrawl.org/","work_id":"eec7545e-c896-4ba3-8e13-6b19deb355f9","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"What learning algorithm is in-context learning? Investigations with linear models","work_id":"c7ff11dd-6785-4052-a878-ceb418d6f000","ref_index":2,"cited_arxiv_id":"2211.15661","is_internal_anchor":true},{"doi":"","year":2022,"title":"Flamingo: a visual language model for few-shot learning","work_id":"80bfdb3e-04fe-4388-9591-7b8e6f9665a0","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2022,"title":"Flamingo: a visual language model for few-shot learning","work_id":"bc11415c-9fcc-43cb-862c-c2b57acb82e5","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2015,"title":"Vqa: Visual question answering","work_id":"752d0e17-6dc9-4e26-8e28-8f32abff46ed","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":104,"snapshot_sha256":"bb3474d6af19c9157873aba7f10d5f51bceaea826f6f12a04c018a6b61ff6a58","internal_anchors":30},"formal_canon":{"evidence_count":2,"snapshot_sha256":"3cc461b4eaae413be7fbf6729bfeace250673f643c90a82d275852facc34917b"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"7d51ca55-25cd-4603-b554-e9732f3645ac"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:53Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"Y4IAkHnJCJs8iv4h3yyyiGodCC7GMrXul0sfRE8OE+eOYtmqBtNXSXnUibXBGh6X/JvU9A0Fc0SCcJiw/I3gBw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-25T17:43:07.303221Z"},"content_sha256":"f3a5b0c1867b074bf63961795bf6bd58ed667a9a83b3240a5bf9d992ae3e36ab","schema_version":"1.0","event_id":"sha256:f3a5b0c1867b074bf63961795bf6bd58ed667a9a83b3240a5bf9d992ae3e36ab"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/SZRBHSLRNVZDJUEZMF5UBKC4W5/bundle.json","state_url":"https://pith.science/pith/SZRBHSLRNVZDJUEZMF5UBKC4W5/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/SZRBHSLRNVZDJUEZMF5UBKC4W5/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-25T17:43:07Z","links":{"resolver":"https://pith.science/pith/SZRBHSLRNVZDJUEZMF5UBKC4W5","bundle":"https://pith.science/pith/SZRBHSLRNVZDJUEZMF5UBKC4W5/bundle.json","state":"https://pith.science/pith/SZRBHSLRNVZDJUEZMF5UBKC4W5/state.json","well_known_bundle":"https://pith.science/.well-known/pith/SZRBHSLRNVZDJUEZMF5UBKC4W5/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2023:SZRBHSLRNVZDJUEZMF5UBKC4W5","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"f66dfa86f6dda71fb473e6f79af6934e8e44dc9ecb4cee8807ff533506874aec","cross_cats_sorted":["cs.CL"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2023-05-05T17:59:46Z","title_canon_sha256":"7f1ada7a3f996e919f83d304f27b98700115314307b314a7a04bb90566b62030"},"schema_version":"1.0","source":{"id":"2305.03726","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2305.03726","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"arxiv_version","alias_value":"2305.03726v2","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2305.03726","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"pith_short_12","alias_value":"SZRBHSLRNVZD","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"SZRBHSLRNVZDJUEZ","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"SZRBHSLR","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:f3a5b0c1867b074bf63961795bf6bd58ed667a9a83b3240a5bf9d992ae3e36ab","target":"graph","created_at":"2026-05-17T23:38:53Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"instruction tuning with these in-context examples substantially enhances model convergence and generalization capabilities. Notably, the extensive scenario coverage provided by the MIMIC-IT dataset empowers the Otter model to excel in tasks involving complex video and multi-image understanding."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the MIMIC-IT dataset's curation of diverse in-context examples across images and videos produces genuine generalization gains rather than dataset-specific improvements, and that the base Flamingo Perceiver architecture seamlessly supports the added multi-modal in-context inputs without hidden limitations."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Otter is a multi-modal model instruction-tuned on the MIMIC-IT dataset of over 3 million in-context instruction-response pairs to improve convergence and generalization on tasks with multiple images and videos."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Otter improves multi-modal instruction following by training on in-context examples from both text and images or videos."}],"snapshot_sha256":"b33bb1d522d9f2c58fe10ea2b497e82c8fde62136b7e893a1742fa13ee0fa660"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"3cc461b4eaae413be7fbf6729bfeace250673f643c90a82d275852facc34917b"},"paper":{"abstract_excerpt":"Recent advances in Large Multimodal Models (LMMs) have unveiled great potential as visual assistants. However, most existing works focus on responding to individual instructions or using previous dialogues for contextual understanding. There is little discussion on employing both images and text as in-context examples to enhance the instruction following capability.\n  To bridge this gap, we introduce the \\textbf{Otter} model to leverage both textual and visual in-context examples for instruction tuning. Specifically, Otter builds upon Flamingo with Perceiver architecture, and has been instruct","authors_text":"Bo Li, Fanyi Pu, Jinghao Wang, Jingkang Yang, Joshua Adrian Cahyono, Liangyu Chen, Yuanhan Zhang, Ziwei Liu","cross_cats":["cs.CL"],"headline":"Otter improves multi-modal instruction following by training on in-context examples from both text and images or videos.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2023-05-05T17:59:46Z","title":"Otter: A Multi-Modal Model with In-Context Instruction Tuning"},"references":{"count":104,"internal_anchors":30,"resolved_work":104,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"https://commoncrawl.org/","work_id":"eec7545e-c896-4ba3-8e13-6b19deb355f9","year":2023},{"cited_arxiv_id":"2211.15661","doi":"","is_internal_anchor":true,"ref_index":2,"title":"What learning algorithm is in-context learning? Investigations with linear models","work_id":"c7ff11dd-6785-4052-a878-ceb418d6f000","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Flamingo: a visual language model for few-shot learning","work_id":"80bfdb3e-04fe-4388-9591-7b8e6f9665a0","year":2022},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Flamingo: a visual language model for few-shot learning","work_id":"bc11415c-9fcc-43cb-862c-c2b57acb82e5","year":2022},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Vqa: Visual question answering","work_id":"752d0e17-6dc9-4e26-8e28-8f32abff46ed","year":2015}],"snapshot_sha256":"bb3474d6af19c9157873aba7f10d5f51bceaea826f6f12a04c018a6b61ff6a58"},"source":{"id":"2305.03726","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-15T02:40:09.150748Z","id":"7d51ca55-25cd-4603-b554-e9732f3645ac","model_set":{"reader":"grok-4.3"},"one_line_summary":"Otter is a multi-modal model instruction-tuned on the MIMIC-IT dataset of over 3 million in-context instruction-response pairs to improve convergence and generalization on tasks with multiple images and videos.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Otter improves multi-modal instruction following by training on in-context examples from both text and images or videos.","strongest_claim":"instruction tuning with these in-context examples substantially enhances model convergence and generalization capabilities. Notably, the extensive scenario coverage provided by the MIMIC-IT dataset empowers the Otter model to excel in tasks involving complex video and multi-image understanding.","weakest_assumption":"That the MIMIC-IT dataset's curation of diverse in-context examples across images and videos produces genuine generalization gains rather than dataset-specific improvements, and that the base Flamingo Perceiver architecture seamlessly supports the added multi-modal in-context inputs without hidden limitations."}},"verdict_id":"7d51ca55-25cd-4603-b554-e9732f3645ac"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:38d84c6f0d2b3f5842fd0196e902e193b829f4608c4ef99ba128e92f896f769a","target":"record","created_at":"2026-05-17T23:38:53Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"f66dfa86f6dda71fb473e6f79af6934e8e44dc9ecb4cee8807ff533506874aec","cross_cats_sorted":["cs.CL"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2023-05-05T17:59:46Z","title_canon_sha256":"7f1ada7a3f996e919f83d304f27b98700115314307b314a7a04bb90566b62030"},"schema_version":"1.0","source":{"id":"2305.03726","kind":"arxiv","version":2}},"canonical_sha256":"966213c9716d7234d099617b40a85cb77984fc3acbebb3591451c6e67aa9b5b8","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"966213c9716d7234d099617b40a85cb77984fc3acbebb3591451c6e67aa9b5b8","first_computed_at":"2026-05-17T23:38:53.781237Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:53.781237Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"eMjFiTgs12kZGeXvSJlbD6Nv1vtjYyOdEbx8Xrkch9QgR0CF/0qY+qRkgA4yZAlJ8o+zjEwdV2B0Y+w7gzxmDQ==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:53.781728Z","signed_message":"canonical_sha256_bytes"},"source_id":"2305.03726","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:38d84c6f0d2b3f5842fd0196e902e193b829f4608c4ef99ba128e92f896f769a","sha256:f3a5b0c1867b074bf63961795bf6bd58ed667a9a83b3240a5bf9d992ae3e36ab"],"state_sha256":"e3b623aee156b70c137f2f1cd6bbffd543bd0e21d67269518c8931b24ebe0972"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"TEBh6AouF+Ehx/Fo7UC5r1ViZpaVfKBx2dMEN4VNTzMHAOurLmZf5cyPkzGh2VlTdZgt0y9P/VPi/SwJ6pUiAw==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-25T17:43:07.308240Z","bundle_sha256":"4dbaab79f02f97e5e908bea5373a8558ddd8563104aced23d10df9136c9da6f7"}}