{"total":13,"items":[{"citing_arxiv_id":"2607.00310","ref_index":9,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"RetailSMV: Exocentric vs. Egocentric Adaptation of Foundation Video World Models in Retail","primary_cat":"cs.CV","submitted_at":"2026-07-01T01:23:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Exocentric-only LoRA adaptation of Cosmos3-Nano on a new synchronized retail video dataset matches or exceeds combined ego+exo training on most held-out metrics.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.27988","ref_index":9,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Latent Visual Diffusion Reasoning with Monte Carlo Tree Search","primary_cat":"cs.CV","submitted_at":"2026-06-26T11:35:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LVDR integrates keypoint-guided MCTS into a latent diffusion reasoning model to deliver competitive skill assessment accuracy alongside explicit visual reasoning trajectories on four sports and surgical datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.09547","ref_index":20,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Streaming Interventions: Can Video Large Language Models Correct Mistakes as They Occur?","primary_cat":"cs.CV","submitted_at":"2026-06-08T14:27:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Introduces Ego-MC-Bench benchmark and Ego-CoMist synthetic dataset showing that fine-tuning video LLMs on proactive mistake corrections improves performance especially for smaller models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.08615","ref_index":15,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Harnessing Streaming Video in the Wild","primary_cat":"cs.CV","submitted_at":"2026-06-07T13:00:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Presents Streaming-Train-248K dataset, Streaming Harness system, and Streaming-Eval benchmark to enable VLMs for proactive, memory-equipped streaming video understanding.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23045","ref_index":20,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"The TIME Machine: On The Power of Motion for Efficient Perception","primary_cat":"cs.CV","submitted_at":"2026-05-21T21:22:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"TIME is a motion-based embedding from point tracks, trained only on synthetic data via masked autoencoding, that matches state-of-the-art video model performance with up to 10,000x less training data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22190","ref_index":17,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"No Pose, No Problem in 4D: Feed-Forward Dynamic Gaussians from Unposed Multi-View Videos","primary_cat":"cs.CV","submitted_at":"2026-05-21T08:57:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"NoPo4D is the first feed-forward system for dynamic 4D Gaussian splatting from unposed multi-view videos, using velocity decomposition supervised by optical flow and a bidirectional motion encoder.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12090","ref_index":189,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"World Action Models: The Next Frontier in Embodied AI","primary_cat":"cs.RO","submitted_at":"2026-05-12T13:10:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"The paper introduces World Action Models as a new paradigm unifying predictive world modeling with action generation in embodied foundation models and provides a taxonomy of existing approaches.","context_count":1,"top_context_role":"dataset","top_context_polarity":"background","context_text":"Ego4D [167], HOI4D [168], EgoVid-5M [169], COM Kitchens [ 170], Egocentric-10k [ 171], DreamDojo [ 35] Assembly101 [172], H2O [ 173], EgoP AT3D [174], Ego-Exo4D [175], ARCTIC [176], HoloAssist [177] HOT3D [178], TACO [179], Kaiwu [ 180], OAKINK2 [181], Nymeria [ 182], EgoMimic [183] PH2D [184], Humanoid Everyday [185], IndEgo [ 186], PLAICraft [187], HD-EPIC [ 188], UniHand [189] Ego-Centric Human Manipulation Dataset [ 190], Aria Everyday Activities [ 191], EgoDex [ 192] Evaluation World Model Visual Fidelity PSNR, SSIM [ 193], LPIPS [ 194], DreamSim [ 195], DINO [196], FVD [197] Physical Commonsense VideoPhy [198], PhyGenBench [199], VBench-2.0 [ 200], WorldModelBench [201] Physics-IQ [202], WorldScore [203], EWMBench [ 204]"},{"citing_arxiv_id":"2605.06747","ref_index":14,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"HumanNet: Scaling Human-centric Video Learning to One Million Hours","primary_cat":"cs.CV","submitted_at":"2026-05-07T15:21:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"HumanNet is a 1M-hour human-centric video dataset with interaction annotations that enables better vision-language-action model performance than equivalent robot data in a controlled test.","context_count":1,"top_context_role":"dataset","top_context_polarity":"background","context_text":"actions are executed, exposing contact dynamics, hand-object relations, temporal intent, and the visual consequences of motor decisions. Third-person video complements this signal by making full-body motion, posture, interaction context, surrounding agents, and scene-level dynamics easier to observe. Large-scale community resources such as Ego4D [13], EPIC-KITCHENS [7], Ego-Exo4D [14], and EgoSchema [25] have expanded recognition, forecasting, narration, and multimodal understanding from egocentric and paired exocentric video, while structured interaction resources such as HOI4D [21] show the value of dense hand-object supervision. Recent work has shown that human-centered data can improve robot learning and representation learning [9, 18, 28, 30, 38], but current corpora remain limited in duration, fragmented across collection efforts,"},{"citing_arxiv_id":"2605.02881","ref_index":15,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MolmoAct2: Action Reasoning Models for Real-world Deployment","primary_cat":"cs.RO","submitted_at":"2026-05-04T17:51:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MolmoAct2 is an open VLA model that outperforms baselines like Pi-05 on 7 benchmarks and whose backbone surpasses GPT-5 on 13 embodied-reasoning tasks through new datasets, specialized training, and architecture changes for lower latency.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"where m masks padded action steps and padded action dimensions. In post-training, we evaluate multiple sampled flow-matching times for each robot action chunk while reusing the same VLM context. At inference, we initialize the trajectory from Gaussian noise and integrate the learned velocity field for a fixed number of Euler steps: zi+1 =z i +∆t f θ(zi, ti, c), t i = i N ,∆t= 1 N .(15) The released checkpoints use N= 10inference steps. The final normalized trajectory is sliced to the embodiment-specific action width and unnormalized with the corresponding dataset statistics. Expert block.The action expert has one transformer block for each VLM layer, givingL= 36expert blocks. A noisy action chunk is first projected from 32 continuous dimensions to the expert hidden width."},{"citing_arxiv_id":"2604.10466","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"ExpertEdit: Learning Skill-Aware Motion Editing from Expert Videos","primary_cat":"cs.CV","submitted_at":"2026-04-12T05:25:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ExpertEdit edits novice motions to expert skill levels by learning a motion prior from unpaired videos and infilling masked skill-critical spans.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"masked language modeling objective that infills masked motion spans with expert-level refinements. At inference, novice motion is masked at skill-critical moments and projected into the learned expert manifold, producing localized skill improvements without paired supervision or manual edit guidance. Across eight diverse techniques and three sports from Ego-Exo4D [13] and Karate Kyokushin [49], ExpertEdit outperforms state-of-the-art supervised motion editing methods on multiple metrics of motion realism and expert quality. Project page:https://vision.cs. utexas.edu/projects/expert_edit/ Keywords:Pose editing·Motion generation·Skilled activity under- standing 1 Introduction Imagine watching a video of yourself performing a layup, but with the smooth"},{"citing_arxiv_id":"2511.21998","ref_index":23,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Can Multi-Modal LLMs Provide Live Step-by-Step Task Guidance?","primary_cat":"cs.CV","submitted_at":"2025-11-27T00:54:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Introduces the first dedicated benchmark for live multi-modal LLM task guidance with mistake detection and a streaming baseline model.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2408.00714","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"SAM 2: Segment Anything in Images and Videos","primary_cat":"cs.CV","submitted_at":"2024-08-01T17:00:08+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SAM 2 delivers more accurate video segmentation with 3x fewer user interactions and 6x faster image segmentation than the original SAM by training a streaming-memory transformer on the largest video segmentation dataset collected to date.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2407.08101","ref_index":23,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"What to Say and When to Say it: Live Fitness Coaching as a Testbed for Situated Interaction","primary_cat":"cs.CV","submitted_at":"2024-07-11T00:10:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Introduces the QEVD benchmark for asynchronous situated interaction in fitness coaching and proposes a streaming baseline to address limitations of existing vision-language models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}