{"work":{"id":"f37bf1c7-4964-4e56-9762-d20da8d9009f","openalex_id":null,"doi":null,"arxiv_id":"2410.21276","raw_key":null,"title":"GPT-4o System Card","authors":null,"authors_text":"Gpt-4o system card , author=","year":2024,"venue":"cs.CL","abstract":"GPT-4o is an autoregressive omni model that accepts as input any combination of text, audio, image, and video, and generates any combination of text, audio, and image outputs. It's trained end-to-end across text, vision, and audio, meaning all inputs and outputs are processed by the same neural network. GPT-4o can respond to audio inputs in as little as 232 milliseconds, with an average of 320 milliseconds, which is similar to human response time in conversation. It matches GPT-4 Turbo performance on text in English and code, with significant improvement on text in non-English languages, while also being much faster and 50\\% cheaper in the API. GPT-4o is especially better at vision and audio understanding compared to existing models. In line with our commitment to building AI safely and consistent with our voluntary commitments to the White House, we are sharing the GPT-4o System Card, which includes our Preparedness Framework evaluations. In this System Card, we provide a detailed look at GPT-4o's capabilities, limitations, and safety evaluations across multiple categories, focusing on speech-to-speech while also evaluating text and image capabilities, and measures we've implemented to ensure the model is safe and aligned. We also include third-party assessments on dangerous capabilities, as well as discussion of potential societal impacts of GPT-4o's text and vision capabilities.","external_url":"https://arxiv.org/abs/2410.21276","cited_by_count":null,"metadata_source":"pith","metadata_fetched_at":"2026-06-29T10:23:17.971564+00:00","pith_arxiv_id":"2410.21276","created_at":"2026-05-09T05:45:22.869369+00:00","updated_at":"2026-06-29T10:23:17.971564+00:00","title_quality_ok":false,"display_title":"GPT-4o System Card","render_title":"GPT-4o System Card"},"hub":{"state":{"work_id":"f37bf1c7-4964-4e56-9762-d20da8d9009f","tier":"super_hub","tier_reason":"100+ Pith inbound or 10,000+ external citations","pith_inbound_count":764,"external_cited_by_count":null,"distinct_field_count":30,"first_pith_cited_at":"2024-06-27T16:47:42+00:00","last_pith_cited_at":"2026-06-24T08:37:38+00:00","author_build_status":"needed","summary_status":"needed","contexts_status":"needed","graph_status":"needed","ask_index_status":"needed","reader_status":"not_needed","recognition_status":"not_needed","updated_at":"2026-06-29T12:28:45.687969+00:00","tier_text":"super_hub"},"tier":"super_hub","role_counts":[{"context_role":"background","n":97},{"context_role":"baseline","n":51},{"context_role":"method","n":23},{"context_role":"dataset","n":3}],"polarity_counts":[{"context_polarity":"background","n":93},{"context_polarity":"baseline","n":51},{"context_polarity":"use_method","n":22},{"context_polarity":"unclear","n":4},{"context_polarity":"use_dataset","n":3},{"context_polarity":"support","n":1}],"runs":{"ask_index":{"job_type":"ask_index","status":"succeeded","result":{"title":"GPT-4o System Card","claims":[{"claim_text":"GPT-4o is an autoregressive omni model that accepts as input any combination of text, audio, image, and video, and generates any combination of text, audio, and image outputs. It's trained end-to-end across text, vision, and audio, meaning all inputs and outputs are processed by the same neural network. GPT-4o can respond to audio inputs in as little as 232 milliseconds, with an average of 320 milliseconds, which is similar to human response time in conversation. It matches GPT-4 Turbo performance on text in English and code, with significant improvement on text in non-English languages, while","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks GPT-4o System Card because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-14T18:30:04.240015+00:00"},"author_expand":{"job_type":"author_expand","status":"succeeded","result":{"authors_linked":[{"id":"a862132e-ac43-4904-b6a0-646800d60539","orcid":null,"display_name":"Gpt-4o system card"},{"id":"fe4d5dbf-e369-4296-8f93-544d5ed81b09","orcid":null,"display_name":"author="}]},"error":null,"updated_at":"2026-05-14T18:30:04.236042+00:00"},"context_extract":{"job_type":"context_extract","status":"succeeded","result":{"enqueued_papers":25},"error":null,"updated_at":"2026-05-14T18:30:16.697015+00:00"},"graph_features":{"job_type":"graph_features","status":"succeeded","result":{"co_cited":[{"title":"Gemini 2.5: Pushing the Frontier with Advanced Reasoning, Multimodality, Long Context, and Next Generation Agentic Capabilities","work_id":"008df105-2fdd-45d8-857a-8e35868aecb6","shared_citers":87},{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","shared_citers":85},{"title":"Qwen2.5-VL Technical Report","work_id":"69dffacb-bfe8-442d-be86-48624c60426f","shared_citers":81},{"title":"DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning","work_id":"e6b75ad5-2877-4168-97c8-710407094d20","shared_citers":76},{"title":"DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models","work_id":"c5006563-f3ec-438a-9e35-b7b484f34828","shared_citers":68},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":56},{"title":"The Llama 3 Herd of Models","work_id":"1549a635-88af-4ac1-acfe-51ae7bb53345","shared_citers":55},{"title":"DeepSeek-V3 Technical Report","work_id":"57d2791d-2219-4c31-a077-afc04b12a75c","shared_citers":50},{"title":"Qwen3-VL Technical Report","work_id":"1fe243aa-e3c0-4da6-b391-4cbcfc88d5c0","shared_citers":50},{"title":"Gemini: A Family of Highly Capable Multimodal Models","work_id":"83f7c85b-3f11-450f-ac0c-64d9745220b2","shared_citers":45},{"title":"Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at Any Resolution","work_id":"8abcfe4f-e0fb-44b7-9123-448fac95f90a","shared_citers":44},{"title":"Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context","work_id":"80e3e977-f1bb-4c83-8d0c-1ab0a0c5c3f1","shared_citers":41},{"title":"LLaVA-OneVision: Easy Visual Task Transfer","work_id":"f5f2452b-f2a9-49ac-b38d-c76e18cdfe49","shared_citers":41},{"title":"Proximal Policy Optimization Algorithms","work_id":"240c67fe-d14d-4520-91c1-38a4e272ca19","shared_citers":33},{"title":"OpenAI o1 System Card","work_id":"68d3c334-0fc9-49e3-b7b0-a69afae933e2","shared_citers":32},{"title":"InternVL3: Exploring Advanced Training and Test-Time Recipes for Open-Source Multimodal Models","work_id":"fe8637aa-12bc-4434-8d36-9f57b5eebcbe","shared_citers":28},{"title":"Qwen2.5 Technical Report","work_id":"d8432992-4980-4a81-85c7-9fa2c2b87f85","shared_citers":28},{"title":"OpenAI GPT-5 System Card","work_id":"ca87689a-0d29-4476-b504-b65dbbb08af4","shared_citers":27},{"title":"InternVL3.5: Advancing Open-Source Multimodal Models in Versatility, Reasoning, and Efficiency","work_id":"b8f5e260-fff5-444e-bcf5-2c42cfefd83d","shared_citers":26},{"title":"DAPO: An Open-Source LLM Reinforcement Learning System at Scale","work_id":"64019d00-0b11-4bbd-b173-b46c8fad0157","shared_citers":23},{"title":"Expanding Performance Boundaries of Open-Source Multimodal Models with Model, Data, and Test-Time Scaling","work_id":"ee70bdc8-4656-4849-ada7-ce42a2278d70","shared_citers":22},{"title":"Qwen Technical Report","work_id":"bb1fd52f-6b2f-437c-9516-37bdf6eb9be8","shared_citers":20},{"title":"Qwen2.5-Omni Technical Report","work_id":"438f105c-fa9b-44aa-ad52-43acb8045cda","shared_citers":19},{"title":"Evaluating Large Language Models Trained on Code","work_id":"042493e9-b26f-4b4e-bbde-382072ca9b08","shared_citers":16}],"time_series":[{"n":1,"year":2024},{"n":11,"year":2025},{"n":348,"year":2026}],"dependency_candidates":[]},"error":null,"updated_at":"2026-05-14T18:29:20.635728+00:00"},"identity_refresh":{"job_type":"identity_refresh","status":"succeeded","result":{"items":[{"title":"Qwen3 Technical Report","outcome":"unchanged","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","resolver":"local_arxiv","confidence":0.98,"old_work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e"}],"counts":{"fixed":0,"merged":0,"unchanged":1,"quarantined":0,"needs_external_resolution":0},"errors":[],"attempted":1},"error":null,"updated_at":"2026-05-14T18:29:41.613927+00:00"},"role_polarity":{"job_type":"role_polarity","status":"succeeded","result":{"title":"GPT-4o System Card","claims":[{"claim_text":"GPT-4o is an autoregressive omni model that accepts as input any combination of text, audio, image, and video, and generates any combination of text, audio, and image outputs. It's trained end-to-end across text, vision, and audio, meaning all inputs and outputs are processed by the same neural network. GPT-4o can respond to audio inputs in as little as 232 milliseconds, with an average of 320 milliseconds, which is similar to human response time in conversation. It matches GPT-4 Turbo performance on text in English and code, with significant improvement on text in non-English languages, while","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks GPT-4o System Card because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-14T18:29:36.980258+00:00"},"summary_claims":{"job_type":"summary_claims","status":"succeeded","result":{"title":"GPT-4o System Card","claims":[{"claim_text":"GPT-4o is an autoregressive omni model that accepts as input any combination of text, audio, image, and video, and generates any combination of text, audio, and image outputs. It's trained end-to-end across text, vision, and audio, meaning all inputs and outputs are processed by the same neural network. GPT-4o can respond to audio inputs in as little as 232 milliseconds, with an average of 320 milliseconds, which is similar to human response time in conversation. It matches GPT-4 Turbo performance on text in English and code, with significant improvement on text in non-English languages, while","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks GPT-4o System Card because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-14T18:29:51.026688+00:00"}},"summary":{"title":"GPT-4o System Card","claims":[{"claim_text":"GPT-4o is an autoregressive omni model that accepts as input any combination of text, audio, image, and video, and generates any combination of text, audio, and image outputs. It's trained end-to-end across text, vision, and audio, meaning all inputs and outputs are processed by the same neural network. GPT-4o can respond to audio inputs in as little as 232 milliseconds, with an average of 320 milliseconds, which is similar to human response time in conversation. It matches GPT-4 Turbo performance on text in English and code, with significant improvement on text in non-English languages, while","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks GPT-4o System Card because it crossed a citation-hub threshold.","role_counts":[]},"graph":{"co_cited":[{"title":"Gemini 2.5: Pushing the Frontier with Advanced Reasoning, Multimodality, Long Context, and Next Generation Agentic Capabilities","work_id":"008df105-2fdd-45d8-857a-8e35868aecb6","shared_citers":87},{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","shared_citers":85},{"title":"Qwen2.5-VL Technical Report","work_id":"69dffacb-bfe8-442d-be86-48624c60426f","shared_citers":81},{"title":"DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning","work_id":"e6b75ad5-2877-4168-97c8-710407094d20","shared_citers":76},{"title":"DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models","work_id":"c5006563-f3ec-438a-9e35-b7b484f34828","shared_citers":68},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":56},{"title":"The Llama 3 Herd of Models","work_id":"1549a635-88af-4ac1-acfe-51ae7bb53345","shared_citers":55},{"title":"DeepSeek-V3 Technical Report","work_id":"57d2791d-2219-4c31-a077-afc04b12a75c","shared_citers":50},{"title":"Qwen3-VL Technical Report","work_id":"1fe243aa-e3c0-4da6-b391-4cbcfc88d5c0","shared_citers":50},{"title":"Gemini: A Family of Highly Capable Multimodal Models","work_id":"83f7c85b-3f11-450f-ac0c-64d9745220b2","shared_citers":45},{"title":"Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at Any Resolution","work_id":"8abcfe4f-e0fb-44b7-9123-448fac95f90a","shared_citers":44},{"title":"Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context","work_id":"80e3e977-f1bb-4c83-8d0c-1ab0a0c5c3f1","shared_citers":41},{"title":"LLaVA-OneVision: Easy Visual Task Transfer","work_id":"f5f2452b-f2a9-49ac-b38d-c76e18cdfe49","shared_citers":41},{"title":"Proximal Policy Optimization Algorithms","work_id":"240c67fe-d14d-4520-91c1-38a4e272ca19","shared_citers":33},{"title":"OpenAI o1 System Card","work_id":"68d3c334-0fc9-49e3-b7b0-a69afae933e2","shared_citers":32},{"title":"InternVL3: Exploring Advanced Training and Test-Time Recipes for Open-Source Multimodal Models","work_id":"fe8637aa-12bc-4434-8d36-9f57b5eebcbe","shared_citers":28},{"title":"Qwen2.5 Technical Report","work_id":"d8432992-4980-4a81-85c7-9fa2c2b87f85","shared_citers":28},{"title":"OpenAI GPT-5 System Card","work_id":"ca87689a-0d29-4476-b504-b65dbbb08af4","shared_citers":27},{"title":"InternVL3.5: Advancing Open-Source Multimodal Models in Versatility, Reasoning, and Efficiency","work_id":"b8f5e260-fff5-444e-bcf5-2c42cfefd83d","shared_citers":26},{"title":"DAPO: An Open-Source LLM Reinforcement Learning System at Scale","work_id":"64019d00-0b11-4bbd-b173-b46c8fad0157","shared_citers":23},{"title":"Expanding Performance Boundaries of Open-Source Multimodal Models with Model, Data, and Test-Time Scaling","work_id":"ee70bdc8-4656-4849-ada7-ce42a2278d70","shared_citers":22},{"title":"Qwen Technical Report","work_id":"bb1fd52f-6b2f-437c-9516-37bdf6eb9be8","shared_citers":20},{"title":"Qwen2.5-Omni Technical Report","work_id":"438f105c-fa9b-44aa-ad52-43acb8045cda","shared_citers":19},{"title":"Evaluating Large Language Models Trained on Code","work_id":"042493e9-b26f-4b4e-bbde-382072ca9b08","shared_citers":16}],"time_series":[{"n":1,"year":2024},{"n":11,"year":2025},{"n":348,"year":2026}],"dependency_candidates":[]},"authors":[{"id":"fe4d5dbf-e369-4296-8f93-544d5ed81b09","orcid":null,"display_name":"author=","source":"manual","import_confidence":0.72},{"id":"a862132e-ac43-4904-b6a0-646800d60539","orcid":null,"display_name":"Gpt-4o system card","source":"manual","import_confidence":0.72}]}}