{"total":133,"items":[{"citing_arxiv_id":"2606.22945","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Controllable Texture Tiling with Transformed RoPE-Enhanced Diffusion Models","primary_cat":"cs.GR","submitted_at":"2026-06-22T07:24:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A Diffusion Transformer framework applies coordinate-transformed RoPE and disjoint attention masks to achieve controllable, high-fidelity texture tiling that preserves reference structure and scene lighting.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.06066","ref_index":31,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"FontFusion: Enhancing Generative Text in Diffusion Models with Typographic Conditioning","primary_cat":"cs.CV","submitted_at":"2026-06-04T12:07:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"FontFusion adds hierarchical token conditioning, position-aware embeddings, and multi-level dropping to DiT diffusion models, yielding 76% relative gains on decorative fonts and 68-76% consistency improvements via a dual DeepFont+DINOv2 encoder.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01493","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Splatshot: 3D Face Avatar Generation from a Single Unconstrained Photo","primary_cat":"cs.CV","submitted_at":"2026-05-31T23:19:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SplatShot is a training-free method that inserts per-step 3DGS refitting and photometric feedback into diffusion denoising to enforce multi-view consistency for single-photo 3D face avatars.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01079","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Chameleon: Style-Content Disentangled Framework for Cross-Domain Object Compositing","primary_cat":"cs.CV","submitted_at":"2026-05-31T07:54:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Chameleon proposes the first large-scale cross-domain compositing dataset and a disentangled encoder plus gated diffusion transformer that outperforms prior in-domain and cross-domain methods on plausibility and fidelity.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00137","ref_index":104,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Advances in Neural 3D Mesh Texturing: A Survey","primary_cat":"cs.CV","submitted_at":"2026-05-28T23:18:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":2.0,"formal_verification":"none","one_line_summary":"A literature survey that organizes neural 3D mesh texturing methods into a taxonomy spanning early GAN-based approaches to modern diffusion pipelines, while reviewing architectures, datasets, evaluation, and open challenges.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30512","ref_index":37,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PhyDrawGen: Physically Grounded Diagram Generation from Natural Language","primary_cat":"cs.AI","submitted_at":"2026-05-28T19:49:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"PhyDrawGen is a neuro-symbolic pipeline that extracts typed scene graphs via LLM, converts them to physically constrained PSLGs via deterministic solver, and refines via fine-tuned Qwen-VL, claiming superior performance over GPT-5-image and Gemini models on 1,449 physics problems.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30282","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Gaze2Act: Gaze-Conditioned Vision-Language-Action Policies for Interactive Robot Manipulation","primary_cat":"cs.RO","submitted_at":"2026-05-28T17:37:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Gaze2Act conditions VLA policies on mapped human gaze for precise object and interaction specification, reporting SOTA intent accuracy and success across 16 real-robot tasks on a Unitree G1 humanoid.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30230","ref_index":43,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"IP-Adapter Is All You Need: Towards Fine-Tuning-Free Diffusion-Based Talking Face Generation","primary_cat":"cs.CV","submitted_at":"2026-05-28T17:00:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A fine-tuning-free framework combines pretrained Stable Diffusion with IP-Adapter plus three parameter-free modules to achieve improved lip synchronization and visual quality in talking face generation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00121","ref_index":55,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Versatile Framework with Semantic and Structural guidance for Image Reconstruction from Brain Activity","primary_cat":"cs.CV","submitted_at":"2026-05-28T09:20:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"UNKNOWN","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MindDiffuser is a two-stage framework that improves semantic and structural fidelity in reconstructing visual stimuli from fMRI, EEG, and MEG signals using CLIP embeddings and Stable Diffusion with backpropagation refinement.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28657","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DEMON: Diffusion Engine for Musical Orchestrated Noise","primary_cat":"cs.SD","submitted_at":"2026-05-27T15:57:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DEMON is a streaming diffusion engine that exposes denoising parameters as playable controls at up to 12.3 decoder completions per second via per-slot scheduling, shared state, source blending, and accelerated decoding.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23610","ref_index":20,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"EM-Vid: Training-Free Entity-Centric Memory for Efficient and Consistent Multi-Shot Video Generation","primary_cat":"cs.CV","submitted_at":"2026-05-22T13:20:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"EM-Vid introduces an entity-centric latent patch memory bank with sparse token conditioning and budgeted updates for training-free consistent multi-shot video generation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23137","ref_index":35,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"STAMBRIDGE: Spectral-Temporal Amplitude-aware Mid-Feature Bridge for EEG Visual Decoding","primary_cat":"eess.IV","submitted_at":"2026-05-22T01:21:45+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22311","ref_index":47,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PIU: Proximity-guided Identity Unlearning in ID-Conditioned Diffusion Models","primary_cat":"cs.CV","submitted_at":"2026-05-21T10:55:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"PIU suppresses target identity generation in Arc2Face by replacing it with a proximity-selected anchor identity through localized fine-tuning of cross-attention layers while preserving output quality for other identities.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21207","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PGC: Peak-Guided Calibration for Generalizable AI-Generated Image Detection","primary_cat":"cs.CV","submitted_at":"2026-05-20T14:04:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"PGC introduces peak-focusing aggregation of local discriminative clues to calibrate global representations for AI-generated image detection, reporting accuracy gains on a new 15-model commercial benchmark and standard datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20807","ref_index":27,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Decomposing Subject-Driven Image Generation via Intermediate Structural Prediction","primary_cat":"cs.CV","submitted_at":"2026-05-20T06:58:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A two-stage method predicts an intermediate Canny map for structure then renders the image conditioned on appearance and structure, paired with a 100k text-aware dataset, to improve detail preservation in subject-driven generation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20777","ref_index":39,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AttriStory: Fine-grained Attribute Realization for Visual Storytelling with Diffusion Models","primary_cat":"cs.CV","submitted_at":"2026-05-20T06:17:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AttriStory adds a benchmark and AttriLoss-based latent optimization to improve faithful rendering of fine-grained attributes such as clothing color and texture in diffusion-model visual storytelling.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20309","ref_index":30,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Tiny-Engram: Trigger-Indexed Concept Tables for Generative Vision","primary_cat":"cs.CV","submitted_at":"2026-05-19T16:27:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Tiny-Engram uses small n-gram-indexed memory tables to bind trigger phrases to target visual identities in diffusion models while preserving compositional control from the surrounding prompt.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18010","ref_index":69,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Functionalization via Structure Completion and Motion Rectification","primary_cat":"cs.CV","submitted_at":"2026-05-18T08:05:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Object functionalization is cast as neural graph completion over a functional graph of parts, contacts, and motions, followed by geometry realization that also rectifies erroneous motions, demonstrated on furniture with a new paired dataset.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17312","ref_index":56,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"VISTA: Triplet-Supervised Video Style Transfer with Diffusion Transformers","primary_cat":"cs.CV","submitted_at":"2026-05-17T08:03:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"VISTA introduces a new synthetic triplet dataset and diffusion-transformer framework with style adapter that jointly models style, content, and motion to achieve state-of-the-art video style transfer.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16990","ref_index":48,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DreamEdit3D: Personalization of Multi-View Diffusion Models for 3D Editing","primary_cat":"cs.CV","submitted_at":"2026-05-16T13:21:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"DreamEdit3D learns separate token embeddings for segmented object components via two-phase multi-view optimization to enable text-guided 3D editing with consistent image generation and mesh reconstruction.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16732","ref_index":78,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DiRotQ: Rotation-Aware Quantization for 4-bit Diffusion Transformers","primary_cat":"cs.CV","submitted_at":"2026-05-16T00:52:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DiRotQ uses PCA-based rotation-aware activation quantization combined with GPTQ to achieve better FID and PSNR in 4-bit diffusion transformers than prior methods like SVDQuant.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16471","ref_index":146,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"From AI-Generated Content to Agentic Action: Security and Safety Threats in Generative AI","primary_cat":"cs.CR","submitted_at":"2026-05-15T13:53:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"The paper analyzes evolving security and safety threats in generative AI from content generation to agentic actions, noting that attack surfaces expand faster than defenses and that many safeguards require institutional coordination not yet in place.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Jailbreaking is treated as a search problem over prompt space. GCG [155] uses greedy coordinate gradient descent to identify adversarial suffixes, achieving 88% ASR on Vicuna-7B with cross-model transferability. PAIR [21] iteratively refines prompts via an attacker LLM in under twenty queries (71% on GPT-3.5, 34% on GPT-4 per JailbreakBench [20]). GPTFuzzer [146] automates template mutation, reporting>90% ASR on some targets. Multi-turn conversational.Gradual boundary erosion across dialogue turns exploits the model's tendency to maintain coherence with prior responses. Crescendo [111] achieves 29-61% higher performance than single-turn methods on GPT-4. Many-shot jailbreaking [3] uses hundreds of in-context demonstrations, with effectiveness"},{"citing_arxiv_id":"2605.15921","ref_index":47,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AdaEraser: Training-Free Object Removal via Adaptive Attention Suppression","primary_cat":"cs.CV","submitted_at":"2026-05-15T13:03:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AdaEraser introduces token-wise adaptive attention suppression in diffusion denoising to enable high-quality training-free object removal by modulating suppression according to evolving self-attention maps.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15681","ref_index":63,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DealMaTe: Multi-Dimensional Material Transfer via Diffusion Transformer","primary_cat":"cs.GR","submitted_at":"2026-05-15T07:06:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"DealMaTe proposes a simplified diffusion framework for material transfer that injects multi-dimensional 3D conditions via Multi-Dim 3D Shader LoRA and Shader Causal Mutual Attention with KV caching.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15660","ref_index":43,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MaTe: Images Are All You Need for Material Transfer via Diffusion Transformer","primary_cat":"cs.CV","submitted_at":"2026-05-15T06:31:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MaTe proposes a training-free diffusion transformer that performs material transfer using only images by integrating them at the token level for unified multi-modal attention in a shared latent space.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13386","ref_index":27,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Support-Conditioned Flow Matching Is Kernel Smoothing","primary_cat":"cs.LG","submitted_at":"2026-05-13T11:44:14+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":8.0,"formal_verification":"none","one_line_summary":"Support-conditioned flow matching under the Gaussian OT path is exactly Nadaraya-Watson kernel smoothing with time-decreasing bandwidth, implemented by a single Gaussian attention head.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12939","ref_index":42,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DirectTryOn: One-Step Virtual Try-On via Straightened Conditional Transport","primary_cat":"cs.CV","submitted_at":"2026-05-13T03:18:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DirectTryOn achieves state-of-the-art one-step virtual try-on performance by applying pure conditional transport, garment preservation loss, and self-consistency loss to straighten trajectories in pretrained generative models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12650","ref_index":18,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CRAFT: Clinical Reward-Aligned Finetuning for Medical Image Synthesis","primary_cat":"cs.CV","submitted_at":"2026-05-12T18:56:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CRAFT adapts diffusion models to medical images via clinical reward alignment from LLMs and VLMs, improving alignment scores and cutting low-quality generations by 20.4% on average across modalities.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12305","ref_index":48,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Images in Sentences: Scaling Interleaved Instructions for Unified Visual Generation","primary_cat":"cs.CV","submitted_at":"2026-05-12T15:54:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"INSET embeds images as native tokens in interleaved instructions, outperforming prior methods on multi-image consistency and text alignment as complexity grows.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"arXiv preprint arXiv:2408.12528, 2024. [46] Jinheng Xie, Zhenheng Yang, and Mike Zheng Shou. Show-o2: Improved native unified multimodal models.arXiv preprint arXiv:2506.15564, 2025. [47] Hu Ye, Jun Zhang, Sibo Liu, Xiao Han, and Wei Yang. Ip-adapter: Text compatible image prompt adapter for text-to-image diffusion models.arXiv preprint arXiv:2308.06721, 2023. [48] Junyan Ye, Dongzhi Jiang, Zihao Wang, Leqi Zhu, Zhenghao Hu, Zilong Huang, Jun He, Zhiyuan Yan, Jinghua Yu, Hongsheng Li, et al. Echo-4o: Harnessing the power of gpt-4o synthetic images for improved image generation. arXiv preprint arXiv:2508.09987, 2025. [49] Chunting Zhou, Lili Yu, Arun Babu, Kushal Tirumala, Michihiro Yasunaga, Leonid Shamis, Jacob Kahn, Xuezhe"},{"citing_arxiv_id":"2605.12271","ref_index":28,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Beyond Text Prompts: Visual-to-Visual Generation as A Unified Paradigm","primary_cat":"cs.CV","submitted_at":"2026-05-12T15:35:34+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Li, and Yong Jae Lee. Gligen: Open-set grounded text-to-image generation. InProceedings of the IEEE/CVF conference on computer vision and pattern recognition, pages 22511-22521, 2023. [27] Hu Ye, Jun Zhang, Sibo Liu, Xiao Han, and Wei Yang. Ip-adapter: Text compatible image prompt adapter for text-to-image diffusion models.arXiv preprint arXiv:2308.06721, 2023. [28] Nataniel Ruiz, Yuanzhen Li, Varun Jampani, Yael Pritch, Michael Rubinstein, and Kfir Aberman. Dreambooth: Fine tuning text-to-image diffusion models for subject-driven generation. In Proceedings of the IEEE/CVF conference on computer vision and pattern recognition, pages 22500-22510, 2023. [29] Andreas Blattmann, Tim Dockhorn, Sumith Kulal, Daniel Mendelevitch, Maciej Kilian, Do-"},{"citing_arxiv_id":"2605.12119","ref_index":42,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MoCam: Unified Novel View Synthesis via Structured Denoising Dynamics","primary_cat":"cs.CV","submitted_at":"2026-05-12T13:35:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MoCam unifies static and dynamic novel view synthesis by temporally decoupling geometric alignment and appearance refinement within the diffusion denoising process.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12088","ref_index":42,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"UniCustom: Unified Visual Conditioning for Multi-Reference Image Generation","primary_cat":"cs.CV","submitted_at":"2026-05-12T13:10:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A unified visual conditioning approach fuses semantic and appearance features before VLM processing, with two-stage training and slot-wise regularization, to improve consistency in multi-reference image generation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12013","ref_index":25,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"L2P: Unlocking Latent Potential for Pixel Generation","primary_cat":"cs.CV","submitted_at":"2026-05-12T12:01:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"L2P repurposes pre-trained LDMs for direct pixel generation via large-patch tokenization and shallow-layer training on synthetic data, matching source performance with 8-GPU training and enabling native 4K output.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11927","ref_index":38,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"RealDiffusion: Physics-informed Attention for Multi-character Storybook Generation","primary_cat":"cs.CV","submitted_at":"2026-05-12T10:39:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"RealDiffusion uses heat diffusion as a dissipative prior and a region-aware stochastic process inside a training-free physics-informed attention mechanism to improve multi-character coherence while preserving narrative dynamism in sequential image generation.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"eration [16, 21], and video production [1, 11, 29]. Leverag- ing large-scale pre-training on massive datasets, these mod- els now outperform earlier approaches in producing high- fidelity and coherent generative content. Current approaches range from slow fine-tuning methods like DreamBooth [26] and Textual Inversion [6], to zero- shot ID injection with encoders like IP-Adapter [38], Pho- toMaker [15], and InstantID [36], but these sacrifice full- body coherence for facial fidelity. The most relevant line of work involves training-free methods that enforce coher- ence by manipulating internal features. This is achieved through shared self-attention [33, 40], addressing attention biases [35], prompt engineering [17], or novel sampling"},{"citing_arxiv_id":"2605.10302","ref_index":37,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Follow the Mean: Reference-Guided Flow Matching","primary_cat":"cs.LG","submitted_at":"2026-05-11T09:57:34+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Representations, 2023. URLhttps://openreview.net/forum?id=_CDixzkzeyb. [36] Mingdeng Cao, Xintao Wang, Zhongang Qi, Ying Shan, Xiaohu Qie, and Yinqiang Zheng. Masactrl: Tuning-free mutual self-attention control for consistent image synthesis and editing. InProceedings of the IEEE/CVF International Conference on Computer Vision (ICCV), pages 22560-22570, October 2023. [37] Hu Ye, Jun Zhang, Sibo Liu, Xiao Han, and Wei Yang. IP-adapter: Text compatible image prompt adapter for text-to-image diffusion models, 2023. URLhttps://arxiv.org/abs/2308.06721. [38] Dongxu Li, Junnan Li, and Steven C. H. Hoi. BLIP-diffusion: Pre-trained subject representation for controllable text-to-image generation and editing. InAdvances in Neural Information Processing Systems,"},{"citing_arxiv_id":"2605.10127","ref_index":54,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Fashion130K: An E-commerce Fashion Dataset for Outfit Generation with Unified Multi-modal Condition","primary_cat":"cs.CV","submitted_at":"2026-05-11T07:40:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Fashion130K dataset and UMC framework align text and visual prompts to generate more consistent fashion outfits than prior state-of-the-art methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09460","ref_index":10,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"When Few Steps Are Enough: Training-Free Acceleration of Identity-Preserved Generation","primary_cat":"cs.CV","submitted_at":"2026-05-10T10:19:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Frozen identity adapter from FLUX dev works on distilled schnell model, enabling 5.9x faster generation with better identity preservation in few steps.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09425","ref_index":96,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AtteConDA: Attention-Based Conflict Suppression in Multi-Condition Diffusion Models and Synthetic Data Augmentation","primary_cat":"cs.CV","submitted_at":"2026-05-10T08:56:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"AtteConDA adds attention-based conflict suppression to multi-condition diffusion models so that generated driving-scene images retain richer structural cues from the original annotations.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"ControlNet freezes a pretrained text-to-image diffusion model and adds trainable control branches for spatial conditions such as edge, depth, and segmentation [102]. T2I-Adapter adds lightweight adapterstoextractcontrollabilityfrompretrainedmodels[ 53]. GLIGENperformsopen-setgrounding by injecting grounding inputs [46]. IP-Adapter integrates image prompts with text-compatible conditioning [96]. MultiDiffusion fuses multiple generation paths for spatial control, and SDEdit, Prompt-to-Prompt, and InstructPix2Pix extend diffusion to editing [1, 52, 28, 5]. Uni-ControlNet is particularly relevant because it processes multiple local controls and global controls in one composable framework [104]. It supports Canny edge, MLSD edge, HED boundary,"},{"citing_arxiv_id":"2605.09007","ref_index":76,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Detecting Deception, Not Deepfakes: Why Media Forensics Needs Social Theories","primary_cat":"cs.CY","submitted_at":"2026-05-09T15:46:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Deepfake detection must shift from classifying media realism to detecting communicative deception by applying Speech Act Theory, Grice's Cooperative Principle, and Cialdini's influence principles.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"International Conference on Learning Representations (ICLR), 2022. [75] Nataniel Ruiz, Yuanzhen Li, Varun Jampani, Yael Pritch, Michael Rubinstein, and Kfir Aberman. DreamBooth: Fine tuning text-to-image diffusion models for subject-driven generation. In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pages 22500-22510, June 2023. [76] Hu Ye, Jun Zhang, Sibo Liu, Xiao Han, and Wei Yang. IP-Adapter: Text compatible image prompt adapter for text-to-image diffusion models.arXiv preprint arXiv:2308.06721, 2023. 14 A Appendix A.1 Core Components of Social-Theoretic Frameworks Table A.1: Core components of the three social-theoretic frameworks, their analytical level, and key conditions required for valid communication."},{"citing_arxiv_id":"2605.08031","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Object Hallucination-Free Reinforcement Unlearning for Vision-Language Models","primary_cat":"cs.CV","submitted_at":"2026-05-08T17:19:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"HFRU is a two-stage reinforcement unlearning method operating on the vision encoder with GRPO optimization and an abstraction reward that achieves over 98% forgetting and retention on object and face tasks with negligible hallucination.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"solution of KL-regularized RL, the optimal policy π(y|x, p) takes a softmax-like form, where the exponentiated reward is reweighted by the reference policy and normalized by a partition func- tion [Lambert, 2026]. Specifically, under the penalty-only rewardR pen, the optimal policy is: πpen(y|x, p) = πref(y|x, p) exp \u0010 Rpen(y) β \u0011 Zpen(x, p) ,(14) where the partition function is Zpen(x, p) = X y′ πref(y′ |x, p) exp \u0012 Rpen(y′) β \u0013 .(15) Under the composite rewardR forget =R pen +R abs, the optimal policy becomes: πcomp(y|x, p) = πref(y|x, p) exp \u0010 Rpen(y)+Rabs(y) β \u0011 Zcomp(x, p) ,(16) with Zcomp(x, p) = X y′ πref(y′ |x, p) exp \u0012 Rpen(y′) +R abs(y′) β \u0013 .(17) 14 For any (x, p)∈ D f , we have Rabs(y)≥0 for all y, and Rabs(y)>0 for sequences containing hypernyms w∈Hyper(D f). Since πref assigns non-zero probability to such sequences, it follows"},{"citing_arxiv_id":"2605.07940","ref_index":55,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Delta-Adapter: Scalable Exemplar-Based Image Editing with Single-Pair Supervision","primary_cat":"cs.CV","submitted_at":"2026-05-08T16:09:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Delta-Adapter extracts a semantic delta from a single image pair via a pre-trained vision encoder and injects it through a Perceiver adapter to enable scalable single-pair supervised editing.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"The gate is initialized to zero, ensuring the model first learns a stable semantic delta representation before gradually incorporating residual corrections. 4.3 Semantic Delta Projection and Injection Given the extracted semantic delta ˜∆a→a′, we project it into a fixed-length sequence of conditioning tokens and inject them into the DiT-based editing backbone. Perceiver-based resampling.Prior IP-Adapter-style methods [ 55, 15] map visual encoder features into the generative model via global average pooling followed by an MLP. For exemplar-based editing, however, we find this design generalizes poorly to unseen tasks. We attribute this limitation to the pooling operation: collapsing ˜∆a→a′ into a single global vector discards the localized and relational changes that are critical for faithfully representing the intended edit."},{"citing_arxiv_id":"2605.07861","ref_index":27,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"From Synthetic to Real: Toward Identity-Consistent Makeup Transfer with Synthetic and Real Data","primary_cat":"cs.CV","submitted_at":"2026-05-08T15:21:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"The work creates identity-consistent synthetic makeup data via ConsistentBeauty and adapts models to real images using reinforcement learning in RealBeauty, achieving better identity preservation and real-world performance than prior methods.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Nichol, C. Chu, and M. Chen, \"Hierarchical text-conditional image generation with clip latents,\"arXiv preprint arXiv:2204.06125, vol. 1, no. 2, p. 3, 2022. [26] H. Ye, J. Zhang, S. Liu, X. Han, and W. Yang, \"Ip-adapter: Text compatible image prompt adapter for text-to-image diffusion models,\" 2023. [Online]. Available: https://arxiv.org/abs/2308.06721 [27] N. Ruiz, Y . Li, V . Jampani, Y . Pritch, M. Rubinstein, and K. Aberman, \"Dreambooth: Fine tuning text-to-image diffusion models for subject- driven generation,\" inProceedings of the IEEE/CVF conference on computer vision and pattern recognition, 2023, pp. 22 500-22 510. [28] T. Brooks, A. Holynski, and A. A. Efros, \"Instructpix2pix: Learning to follow image editing instructions,\" inProceedings of the IEEE/CVF"},{"citing_arxiv_id":"2605.07257","ref_index":41,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Adaptive Subspace Projection for Generative Personalization","primary_cat":"cs.CV","submitted_at":"2026-05-08T05:24:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A training-free adaptive subspace projection method mitigates semantic collapsing in generative personalization by isolating and adjusting drift in a low-dimensional subspace using the stable pre-trained embedding as anchor.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"personalization, person/identity-driven methods focus on facial identity and composition, including FastComposer, Face0, DreamIdentity, and PhotoVerse [40, 38, 9, 8]. Style-driven personalization explores controllable stylistic transfer, e.g., StyleDrop, StyleCrafter, and ArtAdapter [ 33, 22, 6]. 2 Image- or adapter-based conditioning has also been explored to reduce tuning cost, such as IP- Adapter, PhotoMaker, and InstantID [41, 21, 39], which inject image-derived identity cues through lightweight modules. Challenges in Personalization.Personalization from a few examples faces several recurring chal- lenges: language drift and overfitting, limited expressiveness of the conditioning signal, entanglement of concepts in reference sets, and prompt misalignment. Overfitting is commonly mitigated through"},{"citing_arxiv_id":"2605.07074","ref_index":33,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Decoupling Semantics and Fingerprints: A Universal Representation for AI-Generated Image Detection","primary_cat":"cs.CV","submitted_at":"2026-05-08T00:48:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ODP-Net uses instance-aware orthogonal decomposition, perturbation-based purification, and manifold alignment to separate universal forgery traces, generator fingerprints, and semantics, achieving SOTA on unseen architectures like Stable Diffusion 3.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"mains to simulate the realistic scenario where detectors must face unknown synthesis methods. The Source Domain (Training) comprises eight diverse architectures: ProGAN [2], R3GAN [24], BLIP [25], Infinite-ID [26], DALLE-3 [27], FLUX1-dev [28], BlendFace [29], and E4S [30]. The Target Domain (Unseen Evaluation) consists of eight novel genera- tors: StyleSwim [31], WFIR1, InstantID [32], IP-Adapter [33], Midjourney2, SD3 [34], FaceSwap 3, and InSwap 4. Addi- tionally, we evaluate on the In-the-Wild subset (Commu- nityAI [23] and SocialRF [23]) to assess real-world robustness. Evaluation Metrics.We employ Balanced Accuracy (bAcc) to measure classification performance and Negative Log Like- lihood (NLL) to evaluate probability calibration. bAcc is"},{"citing_arxiv_id":"2605.05204","ref_index":109,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"D-OPSD: On-Policy Self-Distillation for Continuously Tuning Step-Distilled Diffusion Models","primary_cat":"cs.CV","submitted_at":"2026-05-06T17:59:34+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"arXiv:2505.07818 (2025) [107] Yang, A., Li, A., Yang, B., Zhang, B., Hui, B., Zheng, B., Yu, B., Gao, C., Huang, C., Lv, C., et al.: Qwen3 technical report. arXiv preprint arXiv:2505.09388 (2025) 21 [108] Yang, C., Qin, C., Si, Q., Chen, M., Gu, N., Yao, D., Lin, Z., Wang, W., Wang, J., Duan, N.: Self-distilled rlvr. arXiv preprint arXiv:2604.03128 (2026) [109] Ye, H., Zhang, J., Liu, S., Han, X., Yang, W.: Ip-adapter: Text compatible image prompt adapter for text-to-image diffusion models. arXiv preprint arXiv:2308.06721 (2023) [110] Ye, T., Dong, L., Wu, X., Huang, S., Wei, F.: On-policy context distillation for language models. arXiv preprint arXiv:2602.12275 (2026) [111] Yin, T., Gharbi, M., Park, T."},{"citing_arxiv_id":"2605.05079","ref_index":31,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A unified Benchmark for Multi-Frame Image Restoration under Severe Refractive Warping","primary_cat":"cs.CV","submitted_at":"2026-05-06T16:14:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Presents the first large-scale benchmark for multi-frame geometric distortion removal in videos under severe refractive warping, using real and synthetic data across four distortion levels and evaluating classical and learning-based methods including a proposed diffusion-based V-cache.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04609","ref_index":46,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Advancing Aesthetic Image Generation via Composition Transfer","primary_cat":"cs.CV","submitted_at":"2026-05-06T07:56:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Composer enables semantic-agnostic composition transfer from references and theme-driven planning via LVLMs to improve aesthetic quality in diffusion-based image generation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04412","ref_index":45,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Structured 3D Latents Are Surprisingly Powerful: Unleashing Generalizable Style with 2D Diffusion","primary_cat":"cs.CV","submitted_at":"2026-05-06T02:08:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DiLAST optimizes 3D latents via guidance from a 2D diffusion model to enable generalizable style transfer for OOD styles in 3D asset generation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02814","ref_index":101,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"IConFace: Identity-Structure Asymmetric Conditioning for Unified Reference-Aware Face Restoration","primary_cat":"cs.CV","submitted_at":"2026-05-04T16:49:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"IConFace performs unified reference-aware and no-reference blind face restoration by asymmetrically conditioning identity from references and structure from the degraded image.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02521","ref_index":49,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"MooD: Perception-Enhanced Efficient Affective Image Editing via Continuous Valence-Arousal Modeling","primary_cat":"cs.CV","submitted_at":"2026-05-04T12:22:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MooD introduces continuous valence-arousal modeling with VA-aware retrieval and perception-enhanced guidance for efficient, controllable affective image editing, plus a new AffectSet dataset.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}