{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2024:OUVZHD3LD3KOSP56UY2XJXM2WG","short_pith_number":"pith:OUVZHD3L","schema_version":"1.0","canonical_sha256":"752b938f6b1ed4e93fbea63574dd9ab1b25f874edfc8f44f2872e14bd2c47fc5","source":{"kind":"arxiv","id":"2401.03568","version":2},"attestation_state":"computed","paper":{"title":"Agent AI: Surveying the Horizons of Multimodal Interaction","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.HC","cs.LG"],"primary_cat":"cs.AI","authors_text":"Bidipta Sarkar, Demetri Terzopoulos, Hoi Vo, Jae Sung Park, Jianfeng Gao, Katsushi Ikeuchi, Li Fei-Fei, Naoki Wake, Qiuyuan Huang, Ran Gong, Rohan Taori, Yejin Choi, Yusuke Noda, Zane Durante","submitted_at":"2024-01-07T19:11:18Z","abstract_excerpt":"Multi-modal AI systems will likely become a ubiquitous presence in our everyday lives. A promising approach to making these systems more interactive is to embody them as agents within physical and virtual environments. At present, systems leverage existing foundation models as the basic building blocks for the creation of embodied agents. Embedding agents within such environments facilitates the ability of models to process and interpret visual and contextual data, which is critical for the creation of more sophisticated and context-aware AI systems. For example, a system that can perceive use"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2401.03568","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2024-01-07T19:11:18Z","cross_cats_sorted":["cs.HC","cs.LG"],"title_canon_sha256":"afe9b091f0c78ca9a3d8dda40b7ef12310343b4c4597713246c670108ad03448","abstract_canon_sha256":"e2a1400445d20d711cc948008d7d8e49b8020e722552124ff24b38b5e41c22e5"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T14:18:56.799004Z","signature_b64":"XOJ0BhEQSOapE1E/89cC1efbgIEeBKgBlnqdFKr6W6d1ugw+0L0Mn8l09fWSKHe58unLXbwletDpG/wTzR/yDQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"752b938f6b1ed4e93fbea63574dd9ab1b25f874edfc8f44f2872e14bd2c47fc5","last_reissued_at":"2026-05-18T14:18:56.796815Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T14:18:56.796815Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Agent AI: Surveying the Horizons of Multimodal Interaction","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.HC","cs.LG"],"primary_cat":"cs.AI","authors_text":"Bidipta Sarkar, Demetri Terzopoulos, Hoi Vo, Jae Sung Park, Jianfeng Gao, Katsushi Ikeuchi, Li Fei-Fei, Naoki Wake, Qiuyuan Huang, Ran Gong, Rohan Taori, Yejin Choi, Yusuke Noda, Zane Durante","submitted_at":"2024-01-07T19:11:18Z","abstract_excerpt":"Multi-modal AI systems will likely become a ubiquitous presence in our everyday lives. A promising approach to making these systems more interactive is to embody them as agents within physical and virtual environments. At present, systems leverage existing foundation models as the basic building blocks for the creation of embodied agents. Embedding agents within such environments facilitates the ability of models to process and interpret visual and contextual data, which is critical for the creation of more sophisticated and context-aware AI systems. For example, a system that can perceive use"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2401.03568","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2401.03568","created_at":"2026-05-18T14:18:56.796927+00:00"},{"alias_kind":"arxiv_version","alias_value":"2401.03568v2","created_at":"2026-05-18T14:18:56.796927+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2401.03568","created_at":"2026-05-18T14:18:56.796927+00:00"},{"alias_kind":"pith_short_12","alias_value":"OUVZHD3LD3KO","created_at":"2026-05-18T14:18:56.796927+00:00"},{"alias_kind":"pith_short_16","alias_value":"OUVZHD3LD3KOSP56","created_at":"2026-05-18T14:18:56.796927+00:00"},{"alias_kind":"pith_short_8","alias_value":"OUVZHD3L","created_at":"2026-05-18T14:18:56.796927+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":18,"internal_anchor_count":18,"sample":[{"citing_arxiv_id":"2509.02547","citing_title":"The Landscape of Agentic Reinforcement Learning for LLMs: A Survey","ref_index":27,"is_internal_anchor":true},{"citing_arxiv_id":"2509.24765","citing_title":"Semantic-Aware Logical Reasoning via a Semiotic Framework","ref_index":8,"is_internal_anchor":true},{"citing_arxiv_id":"2510.05307","citing_title":"When Should Users Check? Modeling Confirmation Frequency inMulti-Step Agentic AI Tasks","ref_index":24,"is_internal_anchor":true},{"citing_arxiv_id":"2510.14133","citing_title":"Formalizing the Safety, Security, and Functional Properties of Agentic AI Systems","ref_index":25,"is_internal_anchor":true},{"citing_arxiv_id":"2512.22579","citing_title":"SANet: A Semantic-aware Agentic AI Networking Framework for Cross-layer Optimization in 6G","ref_index":28,"is_internal_anchor":true},{"citing_arxiv_id":"2602.08392","citing_title":"ST-BiBench: Benchmarking Multi-Stream Multimodal Coordination in Bimanual Embodied Tasks for MLLMs","ref_index":38,"is_internal_anchor":true},{"citing_arxiv_id":"2603.09643","citing_title":"MM-tau-p$^2$: Persona-Adaptive Prompting for Robust Multi-Modal Agent Evaluation in Dual-Control Settings","ref_index":3,"is_internal_anchor":true},{"citing_arxiv_id":"2404.13501","citing_title":"A Survey on the Memory Mechanism of Large Language Model based Agents","ref_index":79,"is_internal_anchor":true},{"citing_arxiv_id":"2504.19678","citing_title":"From LLM Reasoning to Autonomous AI Agents: A Comprehensive Review","ref_index":26,"is_internal_anchor":true},{"citing_arxiv_id":"2605.12718","citing_title":"CHAL: Council of Hierarchical Agentic Language","ref_index":37,"is_internal_anchor":true},{"citing_arxiv_id":"2410.23218","citing_title":"OS-ATLAS: A Foundation Action Model for Generalist GUI Agents","ref_index":10,"is_internal_anchor":true},{"citing_arxiv_id":"2402.06196","citing_title":"Large Language Models: A Survey","ref_index":176,"is_internal_anchor":true},{"citing_arxiv_id":"2604.08044","citing_title":"A Full-Stack Performance Evaluation Infrastructure for 3D-DRAM-based LLM Accelerators","ref_index":14,"is_internal_anchor":true},{"citing_arxiv_id":"2605.07773","citing_title":"Is a team only as strong as its weakest link? Quantifying the short-board effect with AI Agents","ref_index":30,"is_internal_anchor":true},{"citing_arxiv_id":"2605.07462","citing_title":"The Moltbook Files: A Harmless Slopocalypse or Humanity's Last Experiment","ref_index":3,"is_internal_anchor":true},{"citing_arxiv_id":"2604.05552","citing_title":"Context-Agent: Dynamic Discourse Trees for Non-Linear Dialogue","ref_index":1,"is_internal_anchor":true},{"citing_arxiv_id":"2604.05440","citing_title":"LanG -- A Governance-Aware Agentic AI Platform for Unified Security Operations","ref_index":16,"is_internal_anchor":true},{"citing_arxiv_id":"2604.04374","citing_title":"Towards Considerate Human-Robot Coexistence: A Dual-Space Framework of Robot Design and Human Perception in Healthcare","ref_index":1,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/OUVZHD3LD3KOSP56UY2XJXM2WG","json":"https://pith.science/pith/OUVZHD3LD3KOSP56UY2XJXM2WG.json","graph_json":"https://pith.science/api/pith-number/OUVZHD3LD3KOSP56UY2XJXM2WG/graph.json","events_json":"https://pith.science/api/pith-number/OUVZHD3LD3KOSP56UY2XJXM2WG/events.json","paper":"https://pith.science/paper/OUVZHD3L"},"agent_actions":{"view_html":"https://pith.science/pith/OUVZHD3LD3KOSP56UY2XJXM2WG","download_json":"https://pith.science/pith/OUVZHD3LD3KOSP56UY2XJXM2WG.json","view_paper":"https://pith.science/paper/OUVZHD3L","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2401.03568&json=true","fetch_graph":"https://pith.science/api/pith-number/OUVZHD3LD3KOSP56UY2XJXM2WG/graph.json","fetch_events":"https://pith.science/api/pith-number/OUVZHD3LD3KOSP56UY2XJXM2WG/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/OUVZHD3LD3KOSP56UY2XJXM2WG/action/timestamp_anchor","attest_storage":"https://pith.science/pith/OUVZHD3LD3KOSP56UY2XJXM2WG/action/storage_attestation","attest_author":"https://pith.science/pith/OUVZHD3LD3KOSP56UY2XJXM2WG/action/author_attestation","sign_citation":"https://pith.science/pith/OUVZHD3LD3KOSP56UY2XJXM2WG/action/citation_signature","submit_replication":"https://pith.science/pith/OUVZHD3LD3KOSP56UY2XJXM2WG/action/replication_record"}},"created_at":"2026-05-18T14:18:56.796927+00:00","updated_at":"2026-05-18T14:18:56.796927+00:00"}