{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2021:UOOVP6KNB3N3PH6Q6NKB4P653J","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"c328ae9fb67839fb9efacc3ad0e9364bcf672c3b4a0802fcf8968ada843530e7","cross_cats_sorted":["cs.CL","cs.CV","cs.SD","eess.AS"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2021-07-30T17:53:34Z","title_canon_sha256":"51e3957195f6b7855e23f45785d8f55e645d412b477cb4d44eb01f3219901580"},"schema_version":"1.0","source":{"id":"2107.14795","kind":"arxiv","version":3}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2107.14795","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"arxiv_version","alias_value":"2107.14795v3","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2107.14795","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"pith_short_12","alias_value":"UOOVP6KNB3N3","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_16","alias_value":"UOOVP6KNB3N3PH6Q","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_8","alias_value":"UOOVP6KN","created_at":"2026-05-18T12:33:33Z"}],"graph_snapshots":[{"event_id":"sha256:b20869e5218213036bb7cf31c7876f87c34cfd1e658470e30b0ce31443db50dc","target":"graph","created_at":"2026-05-17T23:38:50Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"The same architecture achieves strong results on tasks spanning natural language and visual understanding, multi-task and multi-modal reasoning, and StarCraft II. As highlights, Perceiver IO outperforms a Transformer-based BERT baseline on the GLUE language benchmark despite removing input tokenization and achieves state-of-the-art performance on Sintel optical flow estimation with no explicit mechanisms for multiscale correspondence."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the added flexible querying mechanism can produce outputs of arbitrary sizes and semantics across domains without introducing hidden task-specific assumptions or requiring per-task architectural changes that undermine the generality claim."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Perceiver IO is a general architecture that processes arbitrary structured inputs and outputs with linear scaling and achieves strong results on GLUE, Sintel optical flow, multi-task reasoning, and StarCraft II without task-specific components."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Perceiver IO adds a flexible querying mechanism to the Perceiver so one architecture processes arbitrary structured inputs and produces outputs of any size or type while scaling linearly."}],"snapshot_sha256":"96f5fabd157d02586b1607250c9aec59ae2e9087dbfbf6f42b6bf819c6df6cf1"},"formal_canon":{"evidence_count":1,"snapshot_sha256":"83adb9dc241655ce4a3529e67ad4544ec9ccf4b3fb6fa1c1ee015b7a34a21465"},"paper":{"abstract_excerpt":"A central goal of machine learning is the development of systems that can solve many problems in as many data domains as possible. Current architectures, however, cannot be applied beyond a small set of stereotyped settings, as they bake in domain & task assumptions or scale poorly to large inputs or outputs. In this work, we propose Perceiver IO, a general-purpose architecture that handles data from arbitrary settings while scaling linearly with the size of inputs and outputs. Our model augments the Perceiver with a flexible querying mechanism that enables outputs of various sizes and semanti","authors_text":"Andrew Brock, Andrew Jaegle, Andrew Zisserman, Carl Doersch, Catalin Ionescu, Daniel Zoran, David Ding, Evan Shelhamer, Jean-Baptiste Alayrac, Jo\\=ao Carreira, Matthew M. Botvinick, Olivier H\\'enaff, Oriol Vinyals, Sebastian Borgeaud, Skanda Koppula","cross_cats":["cs.CL","cs.CV","cs.SD","eess.AS"],"headline":"Perceiver IO adds a flexible querying mechanism to the Perceiver so one architecture processes arbitrary structured inputs and produces outputs of any size or type while scaling linearly.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2021-07-30T17:53:34Z","title":"Perceiver IO: A General Architecture for Structured Inputs & Outputs"},"references":{"count":103,"internal_anchors":7,"resolved_work":103,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Imitating interactive intelligence, 2021, 2012.05672 http://arxiv.org/abs/2012.05672","work_id":"993f94c8-0c91-4ae3-9f40-68c03b5723d3","year":2012},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"VATT : Transformers for multimodal self-supervised learning from raw video, audio and text","work_id":"40471e6e-c402-41c0-9da0-b94ca27bc316","year":2021},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Self-supervised multimodal versatile networks","work_id":"222413c3-71e9-4e35-afe9-7d3cb141429e","year":2020},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"The D eep M ind JAX E cosystem, 2020","work_id":"c47be3c1-be1b-4120-8234-644c5355e25f","year":2020},{"cited_arxiv_id":"2004.05150","doi":"","is_internal_anchor":true,"ref_index":5,"title":"Longformer: The Long-Document Transformer","work_id":"abea7a44-6668-4de7-aab6-f53a6e5aa088","year":2004}],"snapshot_sha256":"bc2a8846c668d4d21559628741d95fc4c8692ae1ee00a33e534a8bb0e002d42b"},"source":{"id":"2107.14795","kind":"arxiv","version":3},"verdict":{"created_at":"2026-05-15T19:43:58.764181Z","id":"c6c19aed-5b3e-443d-8004-adc57417d892","model_set":{"reader":"grok-4.3"},"one_line_summary":"Perceiver IO is a general architecture that processes arbitrary structured inputs and outputs with linear scaling and achieves strong results on GLUE, Sintel optical flow, multi-task reasoning, and StarCraft II without task-specific components.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Perceiver IO adds a flexible querying mechanism to the Perceiver so one architecture processes arbitrary structured inputs and produces outputs of any size or type while scaling linearly.","strongest_claim":"The same architecture achieves strong results on tasks spanning natural language and visual understanding, multi-task and multi-modal reasoning, and StarCraft II. As highlights, Perceiver IO outperforms a Transformer-based BERT baseline on the GLUE language benchmark despite removing input tokenization and achieves state-of-the-art performance on Sintel optical flow estimation with no explicit mechanisms for multiscale correspondence.","weakest_assumption":"That the added flexible querying mechanism can produce outputs of arbitrary sizes and semantics across domains without introducing hidden task-specific assumptions or requiring per-task architectural changes that undermine the generality claim."}},"verdict_id":"c6c19aed-5b3e-443d-8004-adc57417d892"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:076ed932e8d7f8b6524d3a15f7fd5085076e8e2f682ba435228e5175cad9bd93","target":"record","created_at":"2026-05-17T23:38:50Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"c328ae9fb67839fb9efacc3ad0e9364bcf672c3b4a0802fcf8968ada843530e7","cross_cats_sorted":["cs.CL","cs.CV","cs.SD","eess.AS"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2021-07-30T17:53:34Z","title_canon_sha256":"51e3957195f6b7855e23f45785d8f55e645d412b477cb4d44eb01f3219901580"},"schema_version":"1.0","source":{"id":"2107.14795","kind":"arxiv","version":3}},"canonical_sha256":"a39d57f94d0edbb79fd0f3541e3fddda713de0c62bb0e3d9d235df20a1976d1a","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"a39d57f94d0edbb79fd0f3541e3fddda713de0c62bb0e3d9d235df20a1976d1a","first_computed_at":"2026-05-17T23:38:50.367308Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:50.367308Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"SESehukw0mbbDbRxrzElronUko+joVfuLqsGcOWkLOpHEocMfT0uqrc5t7A33M2MmqX3bHE/+85lKFNLTW8fAw==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:50.367844Z","signed_message":"canonical_sha256_bytes"},"source_id":"2107.14795","source_kind":"arxiv","source_version":3}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:076ed932e8d7f8b6524d3a15f7fd5085076e8e2f682ba435228e5175cad9bd93","sha256:b20869e5218213036bb7cf31c7876f87c34cfd1e658470e30b0ce31443db50dc"],"state_sha256":"285b8f21f575fa793b86b0b32b373f8781729bd85fddeb139f2ae6119923b3ad"}