{"work":{"id":"f37bf1c7-4964-4e56-9762-d20da8d9009f","openalex_id":null,"doi":null,"arxiv_id":"2410.21276","raw_key":null,"title":"GPT-4o System Card","authors":null,"authors_text":"Gpt-4o system card , author=","year":2024,"venue":"cs.CL","abstract":"GPT-4o is an autoregressive omni model that accepts as input any combination of text, audio, image, and video, and generates any combination of text, audio, and image outputs. It's trained end-to-end across text, vision, and audio, meaning all inputs and outputs are processed by the same neural network. GPT-4o can respond to audio inputs in as little as 232 milliseconds, with an average of 320 milliseconds, which is similar to human response time in conversation. It matches GPT-4 Turbo performance on text in English and code, with significant improvement on text in non-English languages, while also being much faster and 50\\% cheaper in the API. GPT-4o is especially better at vision and audio understanding compared to existing models. In line with our commitment to building AI safely and consistent with our voluntary commitments to the White House, we are sharing the GPT-4o System Card, which includes our Preparedness Framework evaluations. In this System Card, we provide a detailed look at GPT-4o's capabilities, limitations, and safety evaluations across multiple categories, focusing on speech-to-speech while also evaluating text and image capabilities, and measures we've implemented to ensure the model is safe and aligned. We also include third-party assessments on dangerous capabilities, as well as discussion of potential societal impacts of GPT-4o's text and vision capabilities.","external_url":"https://arxiv.org/abs/2410.21276","cited_by_count":null,"metadata_source":"pith","metadata_fetched_at":"2026-07-04T09:59:44.986586+00:00","pith_arxiv_id":"2410.21276","created_at":"2026-05-09T05:45:22.869369+00:00","updated_at":"2026-07-04T09:59:44.986586+00:00","title_quality_ok":false,"display_title":"GPT-4o System Card","render_title":"GPT-4o System Card"},"hub":{"state":{"work_id":"f37bf1c7-4964-4e56-9762-d20da8d9009f","tier":"mega_hub","tier_reason":"1,000+ Pith inbound or 100,000+ external citations","pith_inbound_count":1013,"external_cited_by_count":null,"distinct_field_count":32,"first_pith_cited_at":"2024-06-27T16:47:42+00:00","last_pith_cited_at":"2026-07-02T16:40:08+00:00","author_build_status":"needed","summary_status":"needed","contexts_status":"needed","graph_status":"needed","ask_index_status":"needed","reader_status":"needed","recognition_status":"needed","updated_at":"2026-07-04T09:56:39.652219+00:00","tier_text":"mega_hub"},"tier":"mega_hub","role_counts":[{"context_role":"background","n":98},{"context_role":"baseline","n":52},{"context_role":"method","n":23},{"context_role":"dataset","n":3}],"polarity_counts":[{"context_polarity":"background","n":94},{"context_polarity":"baseline","n":52},{"context_polarity":"use_method","n":22},{"context_polarity":"unclear","n":4},{"context_polarity":"use_dataset","n":3},{"context_polarity":"support","n":1}],"runs":{"ask_index":{"job_type":"ask_index","status":"succeeded","result":{"title":"GPT-4o System Card","claims":[{"claim_text":"GPT-4o is an autoregressive omni model that accepts as input any combination of text, audio, image, and video, and generates any combination of text, audio, and image outputs. It's trained end-to-end across text, vision, and audio, meaning all inputs and outputs are processed by the same neural network. GPT-4o can respond to audio inputs in as little as 232 milliseconds, with an average of 320 milliseconds, which is similar to human response time in conversation. It matches GPT-4 Turbo performance on text in English and code, with significant improvement on text in non-English languages, while","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks GPT-4o System Card because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-14T18:30:04.240015+00:00"},"author_expand":{"job_type":"author_expand","status":"succeeded","result":{"authors_linked":[{"id":"a862132e-ac43-4904-b6a0-646800d60539","orcid":null,"display_name":"Gpt-4o system card"},{"id":"fe4d5dbf-e369-4296-8f93-544d5ed81b09","orcid":null,"display_name":"author="}]},"error":null,"updated_at":"2026-05-14T18:30:04.236042+00:00"},"context_extract":{"job_type":"context_extract","status":"succeeded","result":{"enqueued_papers":25},"error":null,"updated_at":"2026-05-14T18:30:16.697015+00:00"},"graph_features":{"job_type":"graph_features","status":"succeeded","result":{"co_cited":[{"title":"Gemini 2.5: Pushing the Frontier with Advanced Reasoning, Multimodality, Long Context, and Next Generation Agentic Capabilities","work_id":"008df105-2fdd-45d8-857a-8e35868aecb6","shared_citers":87},{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","shared_citers":85},{"title":"Qwen2.5-VL Technical Report","work_id":"69dffacb-bfe8-442d-be86-48624c60426f","shared_citers":81},{"title":"DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning","work_id":"e6b75ad5-2877-4168-97c8-710407094d20","shared_citers":76},{"title":"DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models","work_id":"c5006563-f3ec-438a-9e35-b7b484f34828","shared_citers":68},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":56},{"title":"The Llama 3 Herd of Models","work_id":"1549a635-88af-4ac1-acfe-51ae7bb53345","shared_citers":55},{"title":"DeepSeek-V3 Technical Report","work_id":"57d2791d-2219-4c31-a077-afc04b12a75c","shared_citers":50},{"title":"Qwen3-VL Technical Report","work_id":"1fe243aa-e3c0-4da6-b391-4cbcfc88d5c0","shared_citers":50},{"title":"Gemini: A Family of Highly Capable Multimodal Models","work_id":"83f7c85b-3f11-450f-ac0c-64d9745220b2","shared_citers":45},{"title":"Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at Any Resolution","work_id":"8abcfe4f-e0fb-44b7-9123-448fac95f90a","shared_citers":44},{"title":"Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context","work_id":"80e3e977-f1bb-4c83-8d0c-1ab0a0c5c3f1","shared_citers":41},{"title":"LLaVA-OneVision: Easy Visual Task Transfer","work_id":"f5f2452b-f2a9-49ac-b38d-c76e18cdfe49","shared_citers":41},{"title":"Proximal Policy Optimization Algorithms","work_id":"240c67fe-d14d-4520-91c1-38a4e272ca19","shared_citers":33},{"title":"OpenAI o1 System Card","work_id":"68d3c334-0fc9-49e3-b7b0-a69afae933e2","shared_citers":32},{"title":"InternVL3: Exploring Advanced Training and Test-Time Recipes for Open-Source Multimodal Models","work_id":"fe8637aa-12bc-4434-8d36-9f57b5eebcbe","shared_citers":28},{"title":"Qwen2.5 Technical Report","work_id":"d8432992-4980-4a81-85c7-9fa2c2b87f85","shared_citers":28},{"title":"OpenAI GPT-5 System Card","work_id":"ca87689a-0d29-4476-b504-b65dbbb08af4","shared_citers":27},{"title":"InternVL3.5: Advancing Open-Source Multimodal Models in Versatility, Reasoning, and Efficiency","work_id":"b8f5e260-fff5-444e-bcf5-2c42cfefd83d","shared_citers":26},{"title":"DAPO: An Open-Source LLM Reinforcement Learning System at Scale","work_id":"64019d00-0b11-4bbd-b173-b46c8fad0157","shared_citers":23},{"title":"Expanding Performance Boundaries of Open-Source Multimodal Models with Model, Data, and Test-Time Scaling","work_id":"ee70bdc8-4656-4849-ada7-ce42a2278d70","shared_citers":22},{"title":"Qwen Technical Report","work_id":"bb1fd52f-6b2f-437c-9516-37bdf6eb9be8","shared_citers":20},{"title":"Qwen2.5-Omni Technical Report","work_id":"438f105c-fa9b-44aa-ad52-43acb8045cda","shared_citers":19},{"title":"Evaluating Large Language Models Trained on Code","work_id":"042493e9-b26f-4b4e-bbde-382072ca9b08","shared_citers":16}],"time_series":[{"n":1,"year":2024},{"n":11,"year":2025},{"n":348,"year":2026}],"dependency_candidates":[]},"error":null,"updated_at":"2026-05-14T18:29:20.635728+00:00"},"identity_refresh":{"job_type":"identity_refresh","status":"succeeded","result":{"items":[{"title":"Qwen3 Technical Report","outcome":"unchanged","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","resolver":"local_arxiv","confidence":0.98,"old_work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e"}],"counts":{"fixed":0,"merged":0,"unchanged":1,"quarantined":0,"needs_external_resolution":0},"errors":[],"attempted":1},"error":null,"updated_at":"2026-05-14T18:29:41.613927+00:00"},"reader_index":{"job_type":"reader_index","status":"succeeded","result":{"note":"annotated reader requires full-text/OA fetch; shell is wired for mega hubs","status":"reader queued"},"error":null,"updated_at":"2026-07-04T05:57:08.587595+00:00"},"recognition_alignment":{"job_type":"recognition_alignment","status":"succeeded","result":{"modules":["IndisputableMonolith.Information.Compression","IndisputableMonolith.Physics.DarkMatterCrossSectionBandScoreCard","IndisputableMonolith.Common.CanonicalJBand","IndisputableMonolith.Physics.StandardModelGroupStructure","IndisputableMonolith.Physics.StandardModelLagrangianStructure","IndisputableMonolith.Certificates.Standard","IndisputableMonolith.NumberTheory.HilbertPolyaCandidate","IndisputableMonolith.Unification.RecognitionBandwidth"],"query_chars":1423},"error":null,"updated_at":"2026-07-04T05:57:08.581367+00:00"},"role_polarity":{"job_type":"role_polarity","status":"succeeded","result":{"title":"GPT-4o System Card","claims":[{"claim_text":"GPT-4o is an autoregressive omni model that accepts as input any combination of text, audio, image, and video, and generates any combination of text, audio, and image outputs. It's trained end-to-end across text, vision, and audio, meaning all inputs and outputs are processed by the same neural network. GPT-4o can respond to audio inputs in as little as 232 milliseconds, with an average of 320 milliseconds, which is similar to human response time in conversation. It matches GPT-4 Turbo performance on text in English and code, with significant improvement on text in non-English languages, while","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks GPT-4o System Card because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-14T18:29:36.980258+00:00"},"summary_claims":{"job_type":"summary_claims","status":"succeeded","result":{"title":"GPT-4o System Card","claims":[{"claim_text":"GPT-4o is an autoregressive omni model that accepts as input any combination of text, audio, image, and video, and generates any combination of text, audio, and image outputs. It's trained end-to-end across text, vision, and audio, meaning all inputs and outputs are processed by the same neural network. GPT-4o can respond to audio inputs in as little as 232 milliseconds, with an average of 320 milliseconds, which is similar to human response time in conversation. It matches GPT-4 Turbo performance on text in English and code, with significant improvement on text in non-English languages, while","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks GPT-4o System Card because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-14T18:29:51.026688+00:00"}},"summary":{"title":"GPT-4o System Card","claims":[{"claim_text":"GPT-4o is an autoregressive omni model that accepts as input any combination of text, audio, image, and video, and generates any combination of text, audio, and image outputs. It's trained end-to-end across text, vision, and audio, meaning all inputs and outputs are processed by the same neural network. GPT-4o can respond to audio inputs in as little as 232 milliseconds, with an average of 320 milliseconds, which is similar to human response time in conversation. It matches GPT-4 Turbo performance on text in English and code, with significant improvement on text in non-English languages, while","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks GPT-4o System Card because it crossed a citation-hub threshold.","role_counts":[]},"graph":{"co_cited":[{"title":"Gemini 2.5: Pushing the Frontier with Advanced Reasoning, Multimodality, Long Context, and Next Generation Agentic Capabilities","work_id":"008df105-2fdd-45d8-857a-8e35868aecb6","shared_citers":87},{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","shared_citers":85},{"title":"Qwen2.5-VL Technical Report","work_id":"69dffacb-bfe8-442d-be86-48624c60426f","shared_citers":81},{"title":"DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning","work_id":"e6b75ad5-2877-4168-97c8-710407094d20","shared_citers":76},{"title":"DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models","work_id":"c5006563-f3ec-438a-9e35-b7b484f34828","shared_citers":68},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":56},{"title":"The Llama 3 Herd of Models","work_id":"1549a635-88af-4ac1-acfe-51ae7bb53345","shared_citers":55},{"title":"DeepSeek-V3 Technical Report","work_id":"57d2791d-2219-4c31-a077-afc04b12a75c","shared_citers":50},{"title":"Qwen3-VL Technical Report","work_id":"1fe243aa-e3c0-4da6-b391-4cbcfc88d5c0","shared_citers":50},{"title":"Gemini: A Family of Highly Capable Multimodal Models","work_id":"83f7c85b-3f11-450f-ac0c-64d9745220b2","shared_citers":45},{"title":"Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at Any Resolution","work_id":"8abcfe4f-e0fb-44b7-9123-448fac95f90a","shared_citers":44},{"title":"Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context","work_id":"80e3e977-f1bb-4c83-8d0c-1ab0a0c5c3f1","shared_citers":41},{"title":"LLaVA-OneVision: Easy Visual Task Transfer","work_id":"f5f2452b-f2a9-49ac-b38d-c76e18cdfe49","shared_citers":41},{"title":"Proximal Policy Optimization Algorithms","work_id":"240c67fe-d14d-4520-91c1-38a4e272ca19","shared_citers":33},{"title":"OpenAI o1 System Card","work_id":"68d3c334-0fc9-49e3-b7b0-a69afae933e2","shared_citers":32},{"title":"InternVL3: Exploring Advanced Training and Test-Time Recipes for Open-Source Multimodal Models","work_id":"fe8637aa-12bc-4434-8d36-9f57b5eebcbe","shared_citers":28},{"title":"Qwen2.5 Technical Report","work_id":"d8432992-4980-4a81-85c7-9fa2c2b87f85","shared_citers":28},{"title":"OpenAI GPT-5 System Card","work_id":"ca87689a-0d29-4476-b504-b65dbbb08af4","shared_citers":27},{"title":"InternVL3.5: Advancing Open-Source Multimodal Models in Versatility, Reasoning, and Efficiency","work_id":"b8f5e260-fff5-444e-bcf5-2c42cfefd83d","shared_citers":26},{"title":"DAPO: An Open-Source LLM Reinforcement Learning System at Scale","work_id":"64019d00-0b11-4bbd-b173-b46c8fad0157","shared_citers":23},{"title":"Expanding Performance Boundaries of Open-Source Multimodal Models with Model, Data, and Test-Time Scaling","work_id":"ee70bdc8-4656-4849-ada7-ce42a2278d70","shared_citers":22},{"title":"Qwen Technical Report","work_id":"bb1fd52f-6b2f-437c-9516-37bdf6eb9be8","shared_citers":20},{"title":"Qwen2.5-Omni Technical Report","work_id":"438f105c-fa9b-44aa-ad52-43acb8045cda","shared_citers":19},{"title":"Evaluating Large Language Models Trained on Code","work_id":"042493e9-b26f-4b4e-bbde-382072ca9b08","shared_citers":16}],"time_series":[{"n":1,"year":2024},{"n":11,"year":2025},{"n":348,"year":2026}],"dependency_candidates":[]},"authors":[{"id":"fe4d5dbf-e369-4296-8f93-544d5ed81b09","orcid":null,"display_name":"author=","source":"manual","import_confidence":0.72},{"id":"a862132e-ac43-4904-b6a0-646800d60539","orcid":null,"display_name":"Gpt-4o system card","source":"manual","import_confidence":0.72}]},"citers":{"total":1013,"items":[{"citing_arxiv_id":"2607.02407","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Text-Driven 3D Indoor Scene Synthesis in Non-Manhattan Environments","primary_cat":"cs.AI","submitted_at":"2026-07-02T16:40:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"SPG-Layout combines statistical object priors with hierarchical large-object-first placement to produce physically plausible text-driven 3D scenes in non-Manhattan rooms and outperforms baselines on a new 500-scene benchmark.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.02096","ref_index":19,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LongEgoRefer: A Benchmark for Long-Form Egocentric Video Referring Expression Comprehension","primary_cat":"cs.CV","submitted_at":"2026-07-02T12:32:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LongEgoRefer is a new benchmark of 1,498 referring expressions in 45-minute average egocentric videos that exposes the failure of existing Video REC models on sparse long-form spatio-temporal grounding.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.02045","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PWM-ArtGen: Part World Model for Articulated Object Generation","primary_cat":"cs.CV","submitted_at":"2026-07-02T11:12:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PWM-ArtGen couples action and image diffusion models for joint learning of dynamics and kinematics on a new 19.7k dataset, outperforming baselines with zero-shot generalization to out-of-distribution articulated objects.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.01936","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CausalSteward: An Agentic Divide-Conquer-Combine Copilot for Causal Discovery","primary_cat":"cs.MA","submitted_at":"2026-07-02T09:29:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CausalSteward is a multi-agent divide-conquer-combine framework for causal discovery that integrates prior knowledge with data-driven methods in a human-in-the-loop setup for high-dimensional data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.01784","ref_index":56,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SpaceEra++: A Unified Framework Towards 3D Spatial Reasoning in Video","primary_cat":"cs.CV","submitted_at":"2026-07-02T06:56:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"SpaceEra++ adds ScenePick frame sampling and SpaceAlign pairwise constraints to the prior SpaceEra system, claiming consistent benchmark gains for 3D video spatial reasoning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.01767","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Repair the Amplifier, Not the Symptom: Stable World-Model Correction for Agent Rollouts","primary_cat":"cs.AI","submitted_at":"2026-07-02T06:31:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"WM-SAR identifies and repairs causal subgraphs that amplify errors in agent planning graphs, outperforming symptom-scanning LLM correctors under token constraints.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.01754","ref_index":25,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Path-level Hindsight Instructions for Semantic Exploration in Vision-Language Navigation","primary_cat":"cs.AI","submitted_at":"2026-07-02T06:11:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Phi-Nav generates path-level hindsight instructions from on-policy exploration trajectories to supply additional semantic supervision for vision-language navigation agents.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.01667","ref_index":27,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Temporal and Cross-Modal Alignment for Enhanced Audiovisual Video Captioning","primary_cat":"cs.CV","submitted_at":"2026-07-02T03:47:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"TCA-Captioner introduces an Observer-Checker-Corrector refinement loop and TCA-Bench to address modality detachment and temporal incoherence in audiovisual video captioning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.01658","ref_index":20,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Teaching Vision-Language-Action Models What to See and Where to Look","primary_cat":"cs.CV","submitted_at":"2026-07-02T03:34:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DriveTeach-VLA adds Driving-aware Vision Distillation pretraining and 2D Trajectory-Guided Prompts to VLA models, then reports state-of-the-art results on NAVSIM and nuScenes.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.01469","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A Cost-Aware, Paired Protocol for Auditing Dynamic Tool Synthesis in Agentic Video Question Answering","primary_cat":"cs.CV","submitted_at":"2026-07-01T21:01:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Introduces a cost-aware paired protocol with six outcome groups and applies it to Dynamic-SAGE versus SAGE, reporting 7.5-point accuracy gain, 28% fewer tool calls, but 34% higher token use.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.01425","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Agent4cs: A Multi-agent System for Code Summarization in Large Hierarchical Codebases","primary_cat":"cs.AI","submitted_at":"2026-07-01T19:41:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Agent4cs deploys summarization, keyword-extraction, and quality-assurance agents in a bottom-up pipeline that raises semantic consistency by 8% and normalized keyword coverage by up to 38% over structured prompting baselines on seven frontier models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.01191","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Perceive-to-Reason: Decoupling Perception and Reasoning for Fine-Grained Visual Reasoning","primary_cat":"cs.CV","submitted_at":"2026-07-01T17:24:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"P2R decouples perception from reasoning in VLMs via a two-stage process and PRA-GRPO alternating RL training, reporting gains such as 93.2% on V-Star for the 4B model over its Qwen3-VL backbone.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.02089","ref_index":63,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ESC: Emotional Self-Correction for Reliable Vision-Language Models","primary_cat":"cs.CV","submitted_at":"2026-07-01T14:25:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"ESC uses emotional cues triggered by an external verifier to enable training-free self-correction in VLMs, improving reliability on safety, hallucination, and reasoning benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.00983","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"QCA: Query- and Content-Aware Keyframe Selection for Long Video Understanding","primary_cat":"cs.CV","submitted_at":"2026-07-01T14:19:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"QCA selects compact, query-relevant keyframes from long videos via segment-wise budget allocation and diversity-aware addition, achieving higher accuracy than GPT-4o on LongVideoBench with half the frames.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.00881","ref_index":63,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"OmniView-Space: Reinforcing Spatial Reasoning via Multi-Perspective Spatial Mapping","primary_cat":"cs.CV","submitted_at":"2026-07-01T12:45:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"OmniView-Space framework with MPSM, tool-guided reasoning, and distillation achieves SOTA on spatial reasoning benchmarks for MLLMs while reducing external geometry dependencies.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.00816","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Towards High-Resolution Visual Perception via Hierarchical Entity Exploration","primary_cat":"cs.CV","submitted_at":"2026-07-01T11:41:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"HEE is a training-free, model-agnostic method for high-resolution visual perception in MLLMs using hierarchical entity exploration with dual scoring, detection, clustering, and backtracking.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.00711","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ClarifyCodeBench: Evaluating LLMs on Clarifying Ambiguous Requirements for Code Generation","primary_cat":"cs.SE","submitted_at":"2026-07-01T09:58:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ClarifyCodeBench is a new benchmark with manual annotations and two metrics showing that LLMs strong at code generation are weak at clarifying ambiguous requirements, with performance worsening as ambiguity density rises.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.00547","ref_index":25,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"EgoGapBench: Benchmarking Egocentric Action Selection in Multi-Agent Scenes","primary_cat":"cs.CV","submitted_at":"2026-07-01T07:39:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"EgoGapBench shows humans reliably select egocentric actions in multi-agent scenes while MLLMs systematically choose other agents' actions, and standard egocentric training data fails to close the gap.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.00333","ref_index":29,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"(A)I Sees What You Don't: Exploiting New Attack Surfaces in Third-Party Mobile Agents","primary_cat":"cs.CR","submitted_at":"2026-07-01T02:17:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Identifies Screen Perception and Misused Channel attack surfaces in VLM-powered mobile agents and demonstrates seven attacks enabling arbitrary command execution on five frameworks without privileges.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.00302","ref_index":26,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Wake up for Touch! Mask-isolated Tactile Alignment Learning in MLLMs","primary_cat":"cs.CV","submitted_at":"2026-07-01T01:02:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Splash partitions MLLM parameters into dormant and critical subspaces via significance quantification, updating only the dormant subspace for tactile alignment while preserving general capabilities and achieving SOTA on visuo-tactile benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.00115","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PixelEyes: Decoupling Perception and Reasoning for Pinpoint Visual Evidence Seeking","primary_cat":"cs.CV","submitted_at":"2026-06-30T19:51:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PixelEyes decouples reasoning and perception via mask-guided search and semantic BFS, introduces PixelEyes-6K dataset and Pinpoint-Bench benchmark, and open-sources code and models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.32033","ref_index":20,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SpheRoPE: Zero-Shot Optimization-Free 360 Panorama Generation with Spherical RoPE","primary_cat":"cs.CV","submitted_at":"2026-06-30T17:57:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SpheRoPE modifies rotary position embeddings in diffusion transformers to enforce spherical topology for zero-shot 360 panorama generation across multiple backbones.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.31933","ref_index":32,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"No Place to Hide: Benchmarking Video Hallucination with Background-Controlled Pairs","primary_cat":"cs.CV","submitted_at":"2026-06-30T16:38:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Introduces VidPair-Halluc benchmark of 1K background-controlled adversarial video pairs and 11K QA pairs generated via PairFlow pipeline to evaluate hallucination in LVMs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.31919","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MVP-Nav: Multi-layer Value Map Planner Navigator","primary_cat":"cs.RO","submitted_at":"2026-06-30T16:25:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MVP-Nav reconstructs explicit 3D physical occupancy from monocular RGB using foundation models and integrates it with semantic priorities via a Multi-layer Value Map for grounded planning in zero-shot object navigation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.31608","ref_index":170,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CLExEval: A Human-in-the-Loop Framework for Qualitative Evaluation of LLM Clinical Reasoning","primary_cat":"cs.CL","submitted_at":"2026-06-30T12:56:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CLExEval introduces a human-annotated evaluation framework on 40 rare cases that identifies verbosity bias, hidden knowledge paradox, and 68.6% reasoning-to-output mismatch in LLMs while showing LLM-as-a-Judge overestimates reliability.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.31451","ref_index":16,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"UniTac: A Unified Multimodal Model for Cross-Sensor Tactile Understanding and Generation","primary_cat":"cs.RO","submitted_at":"2026-06-30T10:25:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"UniTac is the first unified multimodal model for cross-sensor tactile understanding and generation, using dual-level representations, two new understanding tasks, and a two-stage training paradigm with sensor-prior sampling to achieve SOTA understanding and realistic cross-sensor generation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.31410","ref_index":14,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Xiaomi-GUI-0 Technical Report","primary_cat":"cs.AI","submitted_at":"2026-06-30T09:36:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Xiaomi-GUI-0 reports 72.0% success on RealMobile and 78.9% on AndroidWorld via real-device closed-loop training with multi-source data and three-stage RL pipeline.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.31399","ref_index":23,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"World-Model Collapse as a Phase Transition","primary_cat":"cs.AI","submitted_at":"2026-06-30T09:28:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Long-horizon language agents show phase-transition-like world-model collapse under small parameter changes, with world-state fidelity failing before action validity, as mapped by grid search in deterministic tasks with gold states.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.31338","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Beyond Binary Instrument QA: Probing Instrument Grounding in Music Audio-Language Models","primary_cat":"cs.SD","submitted_at":"2026-06-30T08:39:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Introduces an OpenMIC-derived multi-axis benchmark sequence showing that high binary instrument QA accuracy fails to predict robust grounding, with models showing position bias, confusable errors, and temporal bias.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.31326","ref_index":27,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Bridging Video Understanding and Generation in a Unified Framework","primary_cat":"cs.CV","submitted_at":"2026-06-30T08:29:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Vega unifies video understanding and generation via shared vocabulary and hybrid autoregressive-diffusion architecture, reporting strong results on VBench and VideoMME.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.31308","ref_index":20,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Benchmarking Large Language Models on Floating-Point Error Classification","primary_cat":"cs.AI","submitted_at":"2026-06-30T08:18:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Introduces InterFLOPBench benchmark and evaluates 14 LLMs on multi-label classification of six floating-point error categories in C code, with top models exceeding 0.88 overall F1 but lower scores on subtle errors like underflow.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.31245","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"HyperVLP: Enhancing Hierarchical Surgical Video-Language Pre-training in Hyperbolic Space","primary_cat":"cs.CV","submitted_at":"2026-06-30T07:21:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"HyperVLP uses hyperbolic geometry in surgical video-language pre-training to preserve hierarchy across actions, steps, and phases, yielding gains in zero- and few-shot phase recognition.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.31169","ref_index":23,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Beyond Single Character: Evaluating MLLMs for Sentence-Level Oracle Bone Inscription Understanding","primary_cat":"cs.CV","submitted_at":"2026-06-30T05:59:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"The paper introduces the S-OBI benchmark for sentence-level oracle bone inscription understanding and reports that current MLLMs remain dependent on character-level recognition due to propagating visual errors.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.31082","ref_index":10,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Fleet: Few Shots Lead Effective AI-generated Image Detection","primary_cat":"cs.CV","submitted_at":"2026-06-30T03:15:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Fleet achieves dynamic few-shot adaptation for AIGI detection via avoidance routing in decoupled subspaces, raising accuracy from 20.4% to 73.1% on new generators like Doubao Seedream 4.0 with 10 shots.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.30987","ref_index":120,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Measuring Judgment Quality in Natural-Language Explanations: Evidence from Forecasting Tournaments","primary_cat":"cs.CL","submitted_at":"2026-06-29T23:51:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"EQMs, sixty LLM-scored reasoning patterns, predict forecast accuracy at both item and person levels and outperform prior text-analysis methods in a large pre-registered tournament dataset.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.30814","ref_index":105,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"When Calibration Rankings Reverse: Accuracy-Controlled Evaluation for Fair Comparison of LLMs","primary_cat":"cs.CL","submitted_at":"2026-06-29T18:37:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Global calibration metrics like ECE are confounded by accuracy; the proposed ACE framework with three accuracy-controlled views shows many prior calibration advantages weaken or reverse.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.30611","ref_index":29,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Reweighting Framewise Attention in Video Transformers for Facial Expression Understanding","primary_cat":"cs.CV","submitted_at":"2026-06-29T17:46:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MiRA is a parameter-free frame-marginal attention redistribution technique for ViT video models that improves sensitivity to localized facial cues on FER benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.30378","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"OmniCoT: A Benchmark for Global and Multi-Step Panoramic Reasoning","primary_cat":"cs.CV","submitted_at":"2026-06-29T14:38:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"OmniCoT is a new panoramic reasoning benchmark with 6.7K eval, 1K real, and 14.3K training examples plus a two-stage SFT+GRPO training method to enforce global 360-degree consistency.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.30217","ref_index":20,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Before Thinking, Learn to Decide: Proactive Routing for Efficient Visual Reasoning","primary_cat":"cs.CL","submitted_at":"2026-06-29T12:30:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PRP introduces proactive routing via Draft Rating Learning and Joint Rating Learning to route queries early between draft and target models for efficient multimodal reasoning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.30116","ref_index":24,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Open Problems in Constitutional Preference Reconstruction","primary_cat":"cs.AI","submitted_at":"2026-06-29T10:47:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Empirical analysis across three datasets identifies three open problems in constitutional preference reconstruction and shows that principle refinement raises inter-executor agreement from 73% to 78%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.30026","ref_index":20,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MuseBench: Benchmarking Intent-Level Audiovisual Arts Understanding in MLLMs","primary_cat":"cs.CV","submitted_at":"2026-06-29T09:27:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MuseBench shows state-of-the-art MLLMs achieve only 48.29% accuracy on intent-level audiovisual arts understanding versus 87.18% for human experts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.29915","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"H-GRPO: Permutation-Invariant Reinforcement Learning for Grounded Visual Reasoning","primary_cat":"cs.CV","submitted_at":"2026-06-29T07:51:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"De-compositional Evidence Grounding decomposes visual reasoning into atomic sub-questions each tied to a specific image region to improve VLM performance and interpretability.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.29814","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Nemotron-Labs-Diffusion-Image: Advancing Masked Discrete Diffusion for High-Resolution Image Synthesis","primary_cat":"cs.CV","submitted_at":"2026-06-29T05:48:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A masked discrete diffusion model adds token editing at inference and grouped cross-entropy training to reach 0.90 GenEval, 86.9 DPG, and 10.76 HPSv3 scores.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.29808","ref_index":26,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Making Multimodal LLMs Reliable Chart Data Extractors: A Benchmark and Training Framework","primary_cat":"cs.HC","submitted_at":"2026-06-29T05:40:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Introduces a benchmark for MLLM-based chart data extraction from unlabeled images and a human-centered training framework that reaches SOTA numerical accuracy with a 7B model.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.29763","ref_index":28,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"TopoAgent: An Agentic Framework for Automated Topology Learning in Medical Imaging","primary_cat":"cs.CV","submitted_at":"2026-06-29T04:17:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Proposes TopoAgent, an LLM agent framework that automatically selects and configures topological descriptors from persistent homology for medical image analysis without task-specific training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.29719","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A Diagnostic Framework and Multi-Evaluator Audit of Evaluator-Driven Preference Dynamics in Self-Adapting LLM Agents","primary_cat":"cs.LG","submitted_at":"2026-06-29T02:55:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A diagnostic framework called EPC reveals that proprietary LLM evaluators can exhibit large preference shifts between versions, as evidenced by a GPT-4o May-to-June drift that inverted study conclusions, rendering single-snapshot evaluations unreliable.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.29308","ref_index":24,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MirrorPPR: Exemplar-Based Portrait Photo Retouching","primary_cat":"cs.CV","submitted_at":"2026-06-28T10:07:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MirrorPPR extracts retouching operations from exemplar pairs via a dedicated extractor and transfers them to query images through a LoRA-adapted Diffusion Transformer, enabled by a new 47-million-pair dataset and self-augmentation for alignment.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.29279","ref_index":29,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Manufactured Confidence: How Memory Consolidation Turns Hearsay into Confident Facts","primary_cat":"cs.CR","submitted_at":"2026-06-28T08:56:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"LLM memory consolidation turns casual hedged statements into confident facts that agents obey regardless of source or verification.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.29097","ref_index":33,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"TrafficAlign: Aligning Large Language Models for Traffic Scenario Generation","primary_cat":"cs.CV","submitted_at":"2026-06-27T21:48:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"TrafficAlign synthesizes scenarios from videos to align LLMs, producing tests that detect 10.8% more collisions than prior methods and enable 36.1% collision reduction after fine-tuning the driving models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.28968","ref_index":37,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Beyond Her: Safety Dynamics in Role-play AI Companions","primary_cat":"cs.CR","submitted_at":"2026-06-27T15:11:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Mixed-methods study of role-play AI companions finds short-term emotional relief that can mask longer-term deterioration, especially among users with internalizing problems who show unstable risk patterns.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.28884","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GigaSpeechBench: A Real-World Multilingual Speech-to-Text Benchmark","primary_cat":"eess.AS","submitted_at":"2026-06-27T12:14:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"GigaSpeechBench is a new 680-hour in-the-wild multilingual ASR/AST benchmark with five modules for low-resource languages, Chinese dialects, English accents, domain terminology, and age-varied speech, showing model performance drops.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.30682","ref_index":32,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ALM2Vec: Learning Audio Embeddings for Universal Audio Retrieval with Large Audio-Language Models","primary_cat":"cs.SD","submitted_at":"2026-06-27T03:56:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"ALM2Vec learns unified audio embeddings from large audio-language models for text-audio retrieval, instruction-aware retrieval, and other tasks across domains.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.27999","ref_index":1,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"HumanMoveVQA: Can Video MLLMs reason about human movement in videos?","primary_cat":"cs.CV","submitted_at":"2026-06-26T11:52:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"HumanMoveVQA is a new benchmark that generates 10K+ QA pairs from 3D-lifted video tracks to evaluate video MLLMs on global human trajectory and orientation reasoning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.27922","ref_index":13,"ref_count":3,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Reflect-R1: Evidence-Driven Reflection for Self-Correction in Long Video Understanding","primary_cat":"cs.CV","submitted_at":"2026-06-26T10:15:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Reflect-R1 introduces the first evidence-driven self-correction framework for long video understanding using a three-stage pipeline, stage-decoupled RL via SD-GRPO, and a 120K dataset to achieve SOTA on VideoMME and LongVideoBench.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.27871","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LocalNav: Distilling Frontier VLMs and Embodied RL for On-Device Object Goal Navigation","primary_cat":"cs.RO","submitted_at":"2026-06-26T09:11:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Distillation from frontier VLMs plus E-RLVR regularization produces a 4B local model that achieves 34.5% SR on OVON while cutting inference latency by 82.8%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.27826","ref_index":52,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"NormAct: A Benchmark for Hidden Social Norm Compliance in Embodied Planning","primary_cat":"cs.AI","submitted_at":"2026-06-26T08:10:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"NormAct shows MLLMs reach explicit goals in 67.3% of cases but comply with hidden norms in only 26.4%, with NormPerceptor raising task success from 24.2% to 46.7%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.27652","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MER-R1: Multimodal Emotion Reasoning via Slow-Fast Thinking Synergy","primary_cat":"cs.AI","submitted_at":"2026-06-26T02:07:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MER-R1 uses dual-objective RL to optimize fast-thinking recall and slow-thinking precision separately in multimodal emotion recognition, with calibration to align them, yielding SOTA results on two benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.27443","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"When Does Personality Composition Matter for Multi-Agent LLM Teams?","primary_cat":"cs.AI","submitted_at":"2026-06-25T18:13:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Empirical study finds that personality composition in multi-agent LLM teams affects performance in a task-dependent manner, with minimal impact on coding milestones but substantial degradation in collaboration and bargaining.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.26551","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PhyEditBench: A Real-World Multi-Stage Benchmark for Physics-Aware Image Editing","primary_cat":"cs.CV","submitted_at":"2026-06-25T02:57:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"PhyEditBench is a new benchmark for physics-aware image editing with real and synthetic instances plus a training-free PhyWorld baseline that uses test-time scaling to outperform SOTA models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.25561","ref_index":38,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CrypFormBench: Benchmarking Formal Analysis Capability of Large Language Models for Cryptographic Schemes","primary_cat":"cs.CR","submitted_at":"2026-06-24T08:37:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CrypFormBench is a new benchmark jointly covering symbolic and computational security to evaluate LLMs on five formal analysis capabilities, with results showing top model Claude-3.5 scores 48.7/100 and most models struggling on generation, transformation, and correction.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.25478","ref_index":16,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"TACO: Towards Task-Consistent Open-Vocabulary Adaptation in Video Recognition","primary_cat":"cs.CV","submitted_at":"2026-06-24T07:06:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"TACO proposes Relative Structure Distillation and a lightweight specialization projection to mitigate inconsistency between fine-tuning and evaluation objectives in open-vocabulary video recognition, claiming state-of-the-art results on cross-dataset and base-to-novel benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.22873","ref_index":226,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SingGuard: A Policy-Adaptive Multimodal LLM Guardrail with Dynamic Reasoning","primary_cat":"cs.CV","submitted_at":"2026-06-22T05:37:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SingGuard introduces a policy-adaptive multimodal LLM guardrail with dynamic reasoning regimes and SingGuard-Bench, reporting SOTA F1 scores across 35 datasets and improved policy-following accuracy under runtime shifts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.22657","ref_index":43,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Foundation Models for Epileptogenic Zone Identification in Drug-Resistant Epilepsy","primary_cat":"cs.LG","submitted_at":"2026-06-21T20:20:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A signal foundation model trained on over 100,000 minutes of sEEG plus a language model achieves 0.978 contact-level PPV for epileptogenic zone identification under leave-one-patient-out evaluation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.22617","ref_index":19,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"OmniSpace: Efficient Geometry Awareness for Autonomous Vehicles MLLMs","primary_cat":"cs.CV","submitted_at":"2026-06-21T17:47:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"OmniSpace is a plug-and-play method that improves spatial reasoning in MLLMs for AV by injecting camera pose, using epipolar attention across views, and distilling 3D geometric knowledge to overcome weak cross-view correspondence and depth estimation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.22537","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"NegAS: Negative Label Guided Attention and Scoring for Out-of-Distribution Object Detection with Vision-Language Models","primary_cat":"cs.CV","submitted_at":"2026-06-21T14:50:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"NegAS uses negative labels for attention guidance and sigmoid scoring to improve OOD detection in VLM-based object detectors while preserving ID performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.22497","ref_index":16,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Benchmarking Vision-Language Models for Microscopic Plant Image Understanding","primary_cat":"cs.CV","submitted_at":"2026-06-21T13:39:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PlantMicro benchmark shows current VLMs achieve low accuracy (e.g. GPT-5 at 34.93% on pathogen classification) on fine-grained microscopic plant image tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.22476","ref_index":20,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CVSBench: A Comprehensive Benchmark for Cross-view Spatial Reasoning and Dreaming","primary_cat":"cs.CV","submitted_at":"2026-06-21T12:35:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CVSBench benchmark shows VLMs struggle with cross-view spatial consistency but improve substantially when given 3D scene imagination inputs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.22471","ref_index":35,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Scalable Multi-Task Data Generation via Reinforcement Learning for Language-Conditioned Bimanual Dexterous Manipulation","primary_cat":"cs.RO","submitted_at":"2026-06-21T12:31:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"An RL data generation pipeline with generalizable rewards and language annotations produces diverse synthetic datasets that improve multi-task policy generalization on three bimanual manipulation tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.22409","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Gold Points Sniper: Self-guided Visual Reasoning in VLM for Fine-grained Action Understanding","primary_cat":"cs.CV","submitted_at":"2026-06-21T09:54:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GPS framework adds self-guided reasoning modules to lightweight VLMs for fine-grained action understanding, claiming performance near GPT-4o with better factual accuracy on a custom CAP-based dataset.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.22219","ref_index":19,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Lost in Aggregation: A Multi-Scale Diagnostic Benchmark for LLM Spatial Navigation","primary_cat":"physics.soc-ph","submitted_at":"2026-06-20T20:41:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A new diagnostic benchmark decomposes LLM spatial navigation into three cognitive scales and shows that cross-scale aggregation, not single-level deficits, causes failure beyond small mazes.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.22138","ref_index":50,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"BioMatrix: Towards a Comprehensive Biological Foundation Model Spanning the Modality Matrix of Sequences, Structures, and Language","primary_cat":"cs.CL","submitted_at":"2026-06-20T16:38:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"BioMatrix unifies sequences, structures, and language for molecules and proteins inside one decoder-only foundation model via shared discrete tokens and achieves SOTA or competitive results on 77 of 80 downstream tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.21933","ref_index":24,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ISCSLP 2026 CoT-TTS Challenge: Chain-of-Thought Reasoning for Context-Aware Text-to-Speech","primary_cat":"cs.SD","submitted_at":"2026-06-20T08:00:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":2.0,"formal_verification":"none","one_line_summary":"The paper announces the ISCSLP 2026 CoT-TTS Challenge with text- and audio-context tracks, large-scale bilingual datasets, and a Qwen3-based baseline requiring both reasoning output and speech generation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.21734","ref_index":64,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"HPP: Hierarchical Programmatic Probing for Long Video Understanding by Decoupling Perception and Reasoning","primary_cat":"cs.CV","submitted_at":"2026-06-19T20:43:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"HPP decouples perception from reasoning in long-video VLMs by having an LLM run iterative programmatic probes on hierarchically segmented video, reporting gains on LongVideoBench, EgoSchema, VideoMME, and MLVU.","context_count":1,"top_context_role":"baseline","top_context_polarity":"baseline","context_text":"Qwen3VL-2B as the lightweight perception model. Our method achieves strong performance across all 7 Table 2: Comparison on challenging long-video un- derstanding benchmarks. Our method outperforms or achieves competitive results compared with methods relying on significantly larger vision models. Model Ego LVB MLVU VMME Claude 4.5 Opus - 57.3 - 77.6GPT-4o (OpenAI, 2024) 72.2 66.7 64.6 77.2Gemini 1.5 Pro (Team, 2024) 71.1 64.0 - 81.3 ShareGPT4Video-8B (Chen et al., 2024a) - 39.7 46.4 43.6VideoChat2-7B (Li et al., 2024b) 56.7 39.3 47.9 43.8LongV A-7B (Zhang et al., 2024d) - 51.3 58.8 54.3LLaV A-Video-7B (Zhang et al., 2025b) 57.3 58.2 70.8 69.7 Aria-28B (Li et al., 2025b) - 64.2 72.3 72.1LLaV A-Video-72B (Zhang et al."},{"citing_arxiv_id":"2606.21406","ref_index":59,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Robot Self-Improvement via Human-Video Dynamics Models","primary_cat":"cs.RO","submitted_at":"2026-06-19T13:17:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Human-video dynamics models enable cross-embodiment robot self-improvement via training-free Dynamics-Guided Action Correction, raising success rates from 40% to 81% on seven real-world tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.21337","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DataClaw0: Agentic Tailoring Multimodal Data from Raw Streams","primary_cat":"cs.LG","submitted_at":"2026-06-19T11:31:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"DataClaw0 introduces an agentic data-tailoring paradigm, a 9B model trained on a synthetically generated dataset, and a new benchmark, claiming improved downstream adaptation in video generation, VQA, and GUI navigation under limited data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.21077","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"OTTER: A Red-Teaming System for Toxicity-Evading Jailbreak Prompt Optimization","primary_cat":"cs.CR","submitted_at":"2026-06-19T03:55:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"OTTER optimizes prompts to decouple surface toxicity from adversarial intent, raising attack success rates on GPT models from 7% to 84% across 457 AdvBench examples.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.20970","ref_index":118,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CogniRoute: Learning to Route Social Evidence in Omni-Modal Models","primary_cat":"cs.CV","submitted_at":"2026-06-18T22:17:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"CogniRoute adds a cognitive schema and route-aware RL to an omni-modal MoE, reaching 59.38% accuracy on a new 118K-example social video QA benchmark and beating prior baselines by 15-27 points.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.20799","ref_index":35,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GroundShot: Visually Consistent Multi-Shot Long Video Generation via Entity-Grounded Shot Scheduling","primary_cat":"cs.CV","submitted_at":"2026-06-18T18:00:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"GroundShot introduces entity-grounded shot scheduling with online visual memory to improve consistency in multi-shot video generation and presents GroundBench for entity-level evaluation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.20543","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SSD: Spatially Speculative Decoding Accelerates Autoregressive Image Generation","primary_cat":"cs.CV","submitted_at":"2026-06-18T17:52:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SSD predicts multiple spatially adjacent tokens at once in autoregressive image models, claiming up to 13.3x inference speedup on DPG-Bench and GenEval with maintained fidelity.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.20244","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SPOT-E: Test-Time Entropy Shaping with Visual Spotlights for Frozen VLMs","primary_cat":"cs.CV","submitted_at":"2026-06-18T13:56:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SPOT-E uses entropy shaping on answer predictions with low-entropy anchors to optimize visual spotlights at test time via GRPO for better VLM performance on evidence-intensive tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.19965","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ROSE: Benchmarking the Perception-to-Action Gap in Multimodal Models","primary_cat":"cs.CV","submitted_at":"2026-06-18T09:05:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ROSE benchmark shows MLLMs drop up to 44.5 percentage points from counting tasks to region-conditioned action on identical scenes, with the gap persisting even when counts are correct.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.19960","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Stellar: Scalable Multimodal Document Retrieval for Natural Language Queries","primary_cat":"cs.IR","submitted_at":"2026-06-18T08:57:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Stellar reduces memory and latency by 1-2 orders of magnitude for multimodal document retrieval via lexical filtering with a fine-tuned MLLM and disk-backed late interaction without loss in effectiveness.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.19758","ref_index":44,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SIGMA: Skill-Incidence Graphs for Compositional Multi-Agent Design","primary_cat":"cs.MA","submitted_at":"2026-06-18T03:41:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SIGMA introduces skill-incidence graphs to compose agents from reusable skills, yielding higher average performance and robustness than topology-only baselines on reasoning and coding benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.19341","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Native Active Perception as Reasoning for Omni-Modal Understanding","primary_cat":"cs.CV","submitted_at":"2026-06-17T17:59:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"OmniAgent formulates omni-modal video understanding as a POMDP with on-demand actions that distill cues into persistent text memory, showing positive test-time scaling and SOTA results on benchmarks like LVBench where a 7B model beats a 72B baseline.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.19258","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CABLE: Cloud-Assisted Bandwidth-efficient LMM-based Encoding for V2X Systems","primary_cat":"cs.CV","submitted_at":"2026-06-17T16:35:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"CABLE uses ego-motion compensated prior masks refined by residual cues and corridor envelopes to select ROIs for cloud LMM inference, cutting pixel upload 73-87% with 5-8x prefill speedup and modest quality loss across five driving datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.18996","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"TRAP: Benchmark for Task-completion and Resistance to Active Privacy-extraction","primary_cat":"cs.CR","submitted_at":"2026-06-17T12:17:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"TRAP benchmark finds leakage in all 22 tested models, proves no soft-constraint defense can achieve high task accuracy with zero leakage for softmax models, and proposes hash-based private field isolation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.18988","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ThinkDeception: A Progressive Reinforcement Learning Framework for Interpretable Multimodal Deception Detection","primary_cat":"cs.AI","submitted_at":"2026-06-17T12:08:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ThinkDeception introduces MLLMs, a multimodal CoT dataset, and VAC-GRPO progressive RL to convert deception detection into interpretable reasoning and claims new SOTA accuracy plus rationale quality.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.18890","ref_index":40,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Skill-Guided Continuation Distillation for GUI Agents","primary_cat":"cs.AI","submitted_at":"2026-06-17T10:07:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SGCD generates supervision for off-trajectory states in GUI agents by mixing expert trajectories with continuations produced by a skill-guided policy after the base policy reaches those states.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.18709","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LLMs Struggle to Measure What Distinguishes Students of Different Proficiency Levels: A Study of Item Discrimination in Reading Comprehension Assessment","primary_cat":"cs.CL","submitted_at":"2026-06-17T05:43:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"LLMs achieve maximum Spearman correlations of 0.152 (direct) and 0.241 (response-based) with human item discrimination values, showing non-random but unreliable signal for distinguishing student proficiency.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.20728","ref_index":34,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"VTOS: Learning to Orchestrate Vision Tools by Co-Searching Solutions and Observers","primary_cat":"cs.CV","submitted_at":"2026-06-17T04:52:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"VTOS jointly searches solution and observer programs to adaptively orchestrate vision tools, outperforming static pipelines on dense object counting and zero-shot plant disease segmentation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.18249","ref_index":20,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Unified Multimodal Autoregressive Modeling with Shared Context-Visual Tokenizer is Key to Unification","primary_cat":"cs.CV","submitted_at":"2026-06-16T17:59:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"UniAR uses a shared context-visual tokenizer with bitwise quantization and parallel prediction in an autoregressive framework to unify visual understanding and generation, claiming SOTA on generation and editing tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.18235","ref_index":20,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"EvolveNav: Proactive Preflection and Self-Evolving Memory for Zero-Shot Object Goal Navigation","primary_cat":"cs.AI","submitted_at":"2026-06-16T17:56:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"EvolveNav adds an agentic rule memory with UCB retrieval and a memory-guided preflection module to enable continuous improvement in zero-shot object goal navigation, reporting a 10.1% success rate gain over baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.18216","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Zone of Proximal Policy Optimization: Teacher in Prompts, Not Gradients","primary_cat":"cs.CL","submitted_at":"2026-06-16T17:46:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ZPPO improves distillation to small vision-language models by using binary and negative candidate prompts plus a replay buffer for hard questions, outperforming standard distillation and GRPO on a 31-benchmark suite with largest gains at the 0.8B scale.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.18181","ref_index":31,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"IUU+DB: Tracking Illegal, Unreported, and Unregulated Fishing, Seafood Fraud, and Labor Abuse through LLM-driven Information Extraction","primary_cat":"cs.IR","submitted_at":"2026-06-16T17:16:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"IUU+DB is an LLM-driven pipeline that classifies documents, extracts structured incident data on IUU+ activities, and supports deduplication and trend analysis.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.30658","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Agentic AI Enhances Physician Trust in Clinical Decision Making","primary_cat":"cs.CY","submitted_at":"2026-06-16T14:31:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Empirical comparison shows physicians have higher cognitive and behavioral trust in agentic AI versus baselines on clinical cases, with noted over-reliance risk.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.20709","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"TeleStyle V2: Beyond Content-Preserving Style Transfer with Self-Distillation and Distribution-Matching-Distillation","primary_cat":"cs.CV","submitted_at":"2026-06-16T10:45:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"TeleStyle V2 uses self-distillation from V1 plus DMD and a prompt enhancer to support RnR/RnS/SnR/SnS reference pairs while matching commercial models on style transfer and general editing.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.17680","ref_index":25,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"EnvRL: Learn from Environment Dynamics in Agentic Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-06-16T08:48:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"EnvRL incorporates environment dynamics learning via state prediction and inverse dynamics auxiliary objectives into agentic RL, reporting higher success rates than RL-only baselines on ALFWorld and WebShop.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.17669","ref_index":32,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DeSRPA: Decoupled Speech Role-Playing Agent via Inference-Time Intervention","primary_cat":"cs.SD","submitted_at":"2026-06-16T08:30:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DeSRPA introduces a dual-level control vector method for inference-time intervention on frozen backbones to improve personality consistency and speech naturalness in role-playing agents over end-to-end fine-tuned baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.17657","ref_index":76,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Using Cognitive Models to Improve Language Model Simulation of Human Persuasion Games","primary_cat":"cs.AI","submitted_at":"2026-06-16T08:16:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Equation-to-Behavior Prompting lets large LLMs match cognitive models like Bayesian updating in persuasion games; RL training cuts small-model belief error by 26.5% and improves diverse training outcomes by 2.5-12%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.17582","ref_index":62,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Collaborative Large and Small Language Models for Accurate and Scalable Data Repair","primary_cat":"cs.DB","submitted_at":"2026-06-16T06:43:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LasRepair++ pairs an LLM instructor with an SLM corrector, refines context via EM, and down-weights uncertain repairs using column-calibrated confidence, reporting 18.1% average F1 gain over baselines on data repair tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":100,"offset":0}}