{"total":46,"items":[{"citing_arxiv_id":"2605.27178","ref_index":89,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"FoundObj: Self-supervised Foundation Models as Rewards for Label-free 3D Object Segmentation","primary_cat":"cs.CV","submitted_at":"2026-05-26T15:32:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"FoundObj uses foundation-model priors as RL rewards to discover multi-class 3D objects from point clouds without scene-level labels.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.26109","ref_index":44,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Helix4D: Complex 4D Mesh Generation","primary_cat":"cs.CV","submitted_at":"2026-05-25T17:59:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Helix4D generates high-quality dynamic 4D meshes from videos by extending Trellis2 with sliding-window cross-frame attention anchored on the first frame and a repurposed 4D temporal encoding.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23888","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GenRecon: Bridging Generative Priors for Multi-View 3D Scene Reconstruction","primary_cat":"cs.CV","submitted_at":"2026-05-22T17:49:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"GenRecon lifts object-level generative priors to scene-scale reconstruction by chunking scenes and using projection-based conditioning on multi-view features, claiming 16% better results than prior methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21472","ref_index":86,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Stream3D: Sequential Multi-View 3D Generation via Evidential Memory","primary_cat":"cs.CV","submitted_at":"2026-05-20T17:55:16+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21121","ref_index":77,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ROAR-3D: Routing Arbitrary Views for High-Fidelity 3D Generation","primary_cat":"cs.CV","submitted_at":"2026-05-20T12:50:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ROAR-3D adds a token-wise view router and dual-stream attention to pretrained single-view 3D generators so they can use arbitrary unposed images for higher-fidelity output.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18680","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CMAG: Concept-Scaffolded Retrieval for Marketplace Avatar Generation","primary_cat":"cs.CV","submitted_at":"2026-05-18T17:21:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"CMAG combines 3D concept scaffolding, prompt decomposition, taxonomy routing, hybrid retrieval, and agentic VLM verification to assemble topologically consistent avatars from catalog assets given free-form text prompts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17853","ref_index":42,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CelloCut: Constructive Watertight Remeshing via Tetrahedral Cell Cuts","primary_cat":"cs.GR","submitted_at":"2026-05-18T04:49:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CelloCut formulates watertight remeshing as binary labeling on a Delaunay tetrahedral partition solved by graph-cut minimization with one-sided constraints to guarantee volumetrically consistent solids.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16813","ref_index":124,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"QuadLink: Autoregressive Quad-Dominant Mesh Generation via Point-Relation Learning","primary_cat":"cs.GR","submitted_at":"2026-05-16T05:04:10+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14594","ref_index":61,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"TOPOS: High-Fidelity and Efficient Industry-Grade 3D Head Generation","primary_cat":"cs.CV","submitted_at":"2026-05-14T09:02:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"TOPOS creates high-fidelity 3D heads with fixed industry topology from single images via a specialized VAE with Perceiver Resampler and a rectified flow transformer.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14462","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Real2Sim in HOI: Toward Physically Plausible HOI Reconstruction from Monocular Videos","primary_cat":"cs.CV","submitted_at":"2026-05-14T06:56:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"HA-HOI produces physically plausible 4D HOI animations from monocular videos by anchoring object reconstruction to human motion and refining the result in a physics-based humanoid-object simulator.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10922","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Pixal3D: Pixel-Aligned 3D Generation from Images","primary_cat":"cs.CV","submitted_at":"2026-05-11T17:55:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Pixal3D performs pixel-aligned 3D generation from images via back-projected multi-scale feature volumes, achieving fidelity close to reconstruction while supporting multi-view and scene synthesis.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10645","ref_index":45,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GenMed: A Pairwise Generative Reformulation of Medical Diagnostic Tasks","primary_cat":"cs.CV","submitted_at":"2026-05-11T14:32:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GenMed uses diffusion models to capture P(X,Y) for medical tasks and performs inference via gradient-based test-time optimization, supporting arbitrary observation combinations without retraining.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Regardless of the underlying representation, recent shape- completion approaches share a common paradigm: the incom- plete shape is treated as a prompt and incorporated into the training process of the diffusion model [42], [43]. However, as in the segmentation case, this relies heavily on the specific format of the prompt [39] and requires large-scale datasets for effective learning [44], [45]. In medical settings, obtaining the necessary large-scale, well-curated, and clean datasets is particularly difficult [11], [46]. Although recent efforts such as MedShapeNet [9] have produced large-scale shape datasets for the medical domain, the inherent difficulty of collecting and curating medical data makes it challenging to achieve per- category sample sizes comparable to natural image datasets"},{"citing_arxiv_id":"2605.07971","ref_index":5,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"DVD: Discrete Voxel Diffusion for 3D Generation and Editing","primary_cat":"cs.CV","submitted_at":"2026-05-08T16:32:17+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"[4] Xianghui Yang, Huiwen Shi, Bowen Zhang, Fan Yang, Jiacheng Wang, Hongxu Zhao, Xinhai Liu, Xinzhou Wang, Qingxiang Lin, Jiaao Yu, Lifu Wang, Jing Xu, Zebin He, Zhuo Chen, Sicong Liu, Junta Wu, Yihang Lian, Shaoxiong Yang, Yuhong Liu, Yong Yang, Di Wang, Jie Jiang, and Chunchao Guo. Hunyuan3d 1.0: A unified framework for text-to-3d and image-to-3d generation, 2025. URLhttps://arxiv.org/abs/2411.02293. [5] Zibo Zhao, Zeqiang Lai, Qingxiang Lin, Yunfei Zhao, Haolin Liu, Shuhui Yang, Yifei Feng, Mingxin Yang, Sheng Zhang, Xianghui Yang, Huiwen Shi, Sicong Liu, Junta Wu, Yihang Lian, Fan Yang, Ruining Tang, Zebin He, Xinzhou Wang, Jian Liu, Xuhui Zuo, Zhuo Chen, Biwen Lei, Haohan Weng, Jing Xu, Yiling Zhu, Xinhai Liu, Lixin Xu, Changrong Hu, Shaoxiong"},{"citing_arxiv_id":"2605.04412","ref_index":10,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Structured 3D Latents Are Surprisingly Powerful: Unleashing Generalizable Style with 2D Diffusion","primary_cat":"cs.CV","submitted_at":"2026-05-06T02:08:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DiLAST optimizes 3D latents via guidance from a 2D diffusion model to enable generalizable style transfer for OOD styles in 3D asset generation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.01171","ref_index":5,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"CADFit: Precise Mesh-to-CAD Program Generation with Hybrid Optimization","primary_cat":"cs.CV","submitted_at":"2026-05-02T00:19:52+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.28134","ref_index":86,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MeshReGen: A Unified 3D Geometry Regeneration Framework","primary_cat":"cs.CV","submitted_at":"2026-04-30T17:18:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MeshReGen introduces a conditioned 3D geometry regenerator with VecSet that learns a regeneration prior via self-supervision and reports state-of-the-art results on controllable generation tasks.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"images of the same object, which are used to reconstruct through 3D volumetric representations such as radiance fields, tri-plane features, or Gaussian splats, from which a 3D mesh can be extracted using Marching Cubes [39]. However, the lack of explicit volumetric constraints can result in open surfaces or inaccurate interiors. In contrast, methods with direct 3D supervision [7,24,28,31,56,66,86, 87] can explicitly generate SDFs, from which 3D meshes can be cleanly extracted with arbitrary topology. Most 3D generators take text or image prompts as input, which are intu- itive but inherently under-constrained. To improve controllability, recent works introduce explicit 3D-aware controls into native 3D generators, such as voxels, boundingboxes,orotherstructuredprimitives(e."},{"citing_arxiv_id":"2604.26917","ref_index":47,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AnimateAnyMesh++: A Flexible 4D Foundation Model for High-Fidelity Text-Driven Mesh Animation","primary_cat":"cs.CV","submitted_at":"2026-04-29T17:27:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"AnimateAnyMesh++ animates arbitrary 3D meshes from text using an expanded 300K-identity DyMesh-XL dataset, a power-law topology-aware DyMeshVAE-Flex, and a variable-length rectified-flow generator to produce semantically accurate, temporally coherent animations in seconds.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Zhu, \"Lpm: Efficient 3d content creation from single image by large-scale partial 3d modeling,\"IEEE Transactions on Circuits and Systems for Video Technology, 2025. [46] X. Yang, H. Shi, B. Zhang, F. Yang, J. Wang, H. Zhao, X. Liu, X. Wang, Q. Lin, J. Yuet al., \"Hunyuan3d 1.0: A unified framework for text-to-3d and image-to-3d generation,\"arXiv preprint arXiv:2411.02293, 2024. [47] Z. Zhao, Z. Lai, Q. Lin, Y . Zhao, H. Liu, S. Yang, Y . Feng, M. Yang, S. Zhang, X. Yanget al., \"Hunyuan3d 2.0: Scaling diffusion mod- els for high resolution textured 3d assets generation,\"arXiv preprint arXiv:2501.12202, 2025. [48] J. Xiang, Z. Lv, S. Xu, Y . Deng, R. Wang, B. Zhang, D. Chen, X. Tong, and J. Yang, \"Structured 3d latents for scalable and versatile"},{"citing_arxiv_id":"2604.23629","ref_index":6,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"From Visual Synthesis to Interactive Worlds: Toward Production-Ready 3D Asset Generation","primary_cat":"cs.GR","submitted_at":"2026-04-26T09:44:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"The paper surveys 3D asset generation methods and organizes them around the full production pipeline to assess which outputs meet engine-level requirements for interactive applications.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"In practice, converting a generated 3D object into a production-ready asset involves retopology, UV layout, material authoring, and rigging, a sequence of labor-intensive steps that collectively constitute what we term the assetization bottleneck. Game engines such as Unreal Engine and Unity impose the strictest requirements on 3D assets, offering the clearest definition of whatproduction-readyconcretely means [6, 26]. An asset reaches production readi- ness only when it can be deployed directly in an engine without manual repair, which requires:(1) Manifold mesh topology-watertight surfaces with structured edge loops suitable for deformation and LOD man- agement [23, 26, 28];(2) Non-overlapping UV parameterization-a distortion-controlled atlas where every texel maps unambiguously to a surface point [29];(3) Disentangled PBR materials-illumination-"},{"citing_arxiv_id":"2604.21592","ref_index":65,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Sculpt4D: Generating 4D Shapes via Sparse-Attention Diffusion Transformers","primary_cat":"cs.CV","submitted_at":"2026-04-23T12:18:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Sculpt4D generates temporally coherent 4D shapes by integrating a block sparse attention mechanism with time-decaying mask into a pretrained 3D diffusion transformer, achieving SOTA results with 56% less computation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.21575","ref_index":67,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"OmniFit: Multi-modal 3D Body Fitting via Scale-agnostic Dense Landmark Prediction","primary_cat":"cs.CV","submitted_at":"2026-04-23T11:55:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"OmniFit uses a conditional transformer decoder to predict dense body landmarks from multi-modal inputs for scale-agnostic SMPL-X fitting, outperforming prior methods by 57-81% and reaching millimeter accuracy on CAPE and 4D-DRESS benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13862","ref_index":23,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Seed3D 2.0: Advancing High-Fidelity Simulation-Ready 3D Content Generation","primary_cat":"cs.GR","submitted_at":"2026-04-22T17:50:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Seed3D 2.0 advances 3D content generation via a coarse-to-fine geometry pipeline, unified PBR material model, and simulation-ready scene tools, reporting 69-89.9% win rates over commercial systems in human studies.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18468","ref_index":30,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Asset Harvester: Extracting 3D Assets from Autonomous Driving Logs for Simulation","primary_cat":"cs.CV","submitted_at":"2026-04-20T16:20:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Asset Harvester converts sparse in-the-wild object observations from AV driving logs into complete simulation-ready 3D assets via data curation, geometry-aware preprocessing, and a SparseViewDiT model that couples sparse-view multiview generation with 3D Gaussian lifting.","context_count":1,"top_context_role":"baseline","top_context_polarity":"baseline","context_text":"We then present qualitative results demonstrating in-the-wild performance, out- of-distribution image generalization, pedestrian animation with our assets, and asset insertion and simulation in NuRec. 5.1. Quantitative Evaluation and Analysis We compareAsset Harvesteragainst representative image-to-3D baselines, including SAM3D [28], TRELLIS [29], and Hunyuan3D [30, 31], on the NuRec AV Object Benchmark. Comparison Setup. For a fair comparison, we use a single RGB input view for all methods. For Asset Harvester (AH), we report two single-view settings: (1) without camera metadata, where we estimate camera parameters using a linear- probing MLP on top of C-Radio features (trained on our training split), and (2) with camera parameters parsed"},{"citing_arxiv_id":"2604.22828","ref_index":5,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MetaEarth3D: Unlocking World-scale 3D Generation with Spatially Scalable Generative Modeling","primary_cat":"cs.CV","submitted_at":"2026-04-19T15:09:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MetaEarth3D is the first generative foundation model for spatially consistent, unbounded 3D scene generation at planetary scale using optical Earth observation data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.15058","ref_index":90,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"\"From remembering to shaping\": Narrating Shared Experiences by Co-Designing Cultural Heritage Artifacts in Collaborative VR","primary_cat":"cs.HC","submitted_at":"2026-04-16T14:25:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A collaborative VR workflow with GenAI lets users merge prompts and creatively repurpose outputs to co-create 3D artifacts that narrate shared cultural heritage experiences.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.14556","ref_index":39,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Controllable Video Object Insertion via Multiview Priors","primary_cat":"cs.CV","submitted_at":"2026-04-16T02:39:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A multi-view prior-based framework for video object insertion that uses dual-path conditioning and an integration-aware consistency module to improve appearance stability and occlusion handling.","context_count":1,"top_context_role":"background","top_context_polarity":"unclear","context_text":"Zhou, Wente Wang, Wenting Shen, Wenyuan Yu, Xianzhong Shi, Xiaoming Huang, Xin Xu, Yan Kou, Yangyu Lv, Yifei Li, Yijing Liu, Yiming Wang, Yingya Zhang, Yitong Huang, Yong Li, You Wu, Yu Liu, Yulin Pan, Yun Zheng, Yuntao Hong, Yupeng Shi, Yutong Feng, Zeyinzi Jiang, Zhen Han, Zhi-Fan Wu, and Ziyu Liu. 2025. Wan: Open and Advanced Large-Scale Video Generative Models.arXiv preprint arXiv:2503.20314(2025). [39] Wen Wang, Yan Jiang, Kangyang Xie, Zide Liu, Hao Chen, Yue Cao, Xinlong Wang, and Chunhua Shen. 2023. Zero-shot video editing using off-the-shelf image diffusion models.arXiv preprint arXiv:2303.17599(2023). [40] Xiang Wang, Hangjie Yuan, Shiwei Zhang, Dayou Chen, Jiuniu Wang, Yingya Zhang, Yujun Shen, Deli Zhao, and Jingren Zhou. 2023. Videocomposer: Composi-"},{"citing_arxiv_id":"2604.14302","ref_index":59,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Geometrically Consistent Multi-View Scene Generation from Freehand Sketches","primary_cat":"cs.CV","submitted_at":"2026-04-15T18:00:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A framework generates consistent multi-view scenes from one freehand sketch via a ~9k-sample dataset, Parallel Camera-Aware Attention Adapters, and Sparse Correspondence Supervision Loss, outperforming baselines in realism and consistency.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"improve consistency by generating multiple views jointly through synchronised diffusion or multi-view attention. A parallel line of work repurposes video dif- fusion models, treating the temporal axis as a viewpoint axis, for object-centric 4 Bourouis et al. orbiting [8,44] or scene-level [12] generation. More recent DiT-based architec- tures push the scale further: Hunyuan3D2.0 [59] trains a 2B-parameter shape- generation DiT, TRELLIS [51] employs rectified flow transformers over struc- tured 3D latents, and See3D [34] learns 3D priors from 16M video clips without pose annotations. How camera geometry is communicated to the generative model is a critical design axis. Absolute conditioning methods [13,29,50] inject camera parameters"},{"citing_arxiv_id":"2604.13863","ref_index":55,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"PostureObjectstitch: Anomaly Image Generation Considering Assembly Relationships in Industrial Scenarios","primary_cat":"cs.CV","submitted_at":"2026-04-15T13:29:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PostureObjectStitch generates assembly-aware anomaly images by decoupling multi-view features into high-frequency, texture and RGB components, modulating them temporally in a diffusion model, and applying conditional loss plus geometric priors to preserve correct component relationships.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.13688","ref_index":93,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Beyond Voxel 3D Editing: Learning from 3D Masks and Self-Constructed Data","primary_cat":"cs.CV","submitted_at":"2026-04-15T10:10:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"BVE framework enables text-guided 3D editing beyond voxel limits by combining self-constructed data, lightweight semantic injection, and annotation-free masking to preserve local invariance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.12309","ref_index":36,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Towards Realistic and Consistent Orbital Video Generation via 3D Foundation Priors","primary_cat":"cs.CV","submitted_at":"2026-04-14T05:35:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A video generation approach conditions a base model with multi-scale 3D latent features and a cross-attention adapter to produce geometrically realistic and consistent orbital videos from one image.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.11808","ref_index":33,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Pair2Scene: Learning Local Object Relations for Procedural Scene Generation","primary_cat":"cs.CV","submitted_at":"2026-04-13T17:59:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Pair2Scene generates complex 3D scenes beyond training data by training a network on local object-pair placement rules and applying them recursively with collision-aware sampling.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.11331","ref_index":96,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Any 3D Scene is Worth 1K Tokens: 3D-Grounded Representation for Scene Generation at Scale","primary_cat":"cs.CV","submitted_at":"2026-04-13T11:32:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A 3D-grounded autoencoder and diffusion transformer allow direct generation of 3D scenes in an implicit latent space using a fixed 1K-token representation for arbitrary views and resolutions.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"tain explicit 3D memory based on historical frames, and condition subsequent generation on historical renderings to enforce coherence. Despite these efforts, all such methods fundamentally operate in a 2D latent space, which introduces substantial representation redundancy and fails to ensure spatial consistency at the 3D level. Another line of work [77,96] focuses on object-level 3D generation. Leveraging massive high-quality 3D asset data, these methods achieve 3D gen- eration in explicit 3D latent space based on point clouds or voxels. However, for scene-level generation, such high-quality 3D data is unavailable. How to achieve 3D scene generation using only multi-view image data remains a challenge."},{"citing_arxiv_id":"2604.08983","ref_index":50,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"AssemLM: A Spatial Reasoning Multimodal Large Language Model for Robotic Assembly","primary_cat":"cs.RO","submitted_at":"2026-04-10T05:43:39+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":1,"top_context_role":"other","top_context_polarity":"unclear","context_text":"large language models to understand point clouds. In European Conference on Computer Vision, pages 131- 147. Springer, 2024. [49] Jiahao Zhang, Anoop Cherian, Cristian Rodriguez, Wei- jian Deng, and Stephen Gould. Manual-pa: Learning 3d part assembly from instruction diagrams. InProceedings of the IEEE/CVF International Conference on Computer Vision, pages 6304-6314, 2025. [50] Zibo Zhao, Zeqiang Lai, Qingxiang Lin, Yunfei Zhao, Haolin Liu, Shuhui Yang, Yifei Feng, Mingxin Yang, Sheng Zhang, Xianghui Yang, et al. Hunyuan3d 2.0: Scaling diffusion models for high resolution textured 3d assets generation.arXiv preprint arXiv:2501.12202, 2025. [51] Enshen Zhou, Jingkun An, Cheng Chi, Yi Han, Shanyu Rong, Chi Zhang, Pengwei Wang, Zhongyuan Wang,"},{"citing_arxiv_id":"2604.07273","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"GenLCA: 3D Diffusion for Full-Body Avatars from In-the-Wild Videos","primary_cat":"cs.CV","submitted_at":"2026-04-08T16:34:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"GenLCA enables scalable training of a 3D diffusion model for photorealistic, animatable full-body avatars by tokenizing large-scale real-world videos with a pretrained reconstructor and applying visibility-aware diffusion training to handle partial observations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.04707","ref_index":162,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"OpenWorldLib: A Unified Codebase and Definition of Advanced World Models","primary_cat":"cs.CV","submitted_at":"2026-04-06T14:19:48+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"InProceedings of the IEEE/CVF International Conference on Computer Vision, pages 10612-10623, 2025. [161] Shanshan Zhao, Xinjie Zhang, Jintao Guo, Jiakui Hu, Lunhao Duan, Minghao Fu, Yong Xien Chng, Guo-Hua Wang, Qing-Guo Chen, Zhao Xu, et al. Unified multimodal understanding and generation models: Advances, challenges, and opportunities.arXiv preprint arXiv:2505.02567, 2025. [162] Zibo Zhao, Zeqiang Lai, Qingxiang Lin, Yunfei Zhao, Haolin Liu, Shuhui Yang, Yifei Feng, Mingxin Yang, Sheng Zhang, Xianghui Yang, et al. Hunyuan3d 2.0: Scaling diffusion models for high resolution textured 3d assets generation. arXiv preprint arXiv:2501.12202, 2025. OpenDCAI Technical Report 26 [163] Duo Zheng, Shijia Huang, Yanyang Li, and Liwei Wang."},{"citing_arxiv_id":"2604.02764","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"InverseDraping: Recovering Sewing Patterns from 3D Garment Surfaces via BoxMesh Bridging","primary_cat":"cs.CV","submitted_at":"2026-04-03T06:19:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A two-stage autoregressive framework centered on BoxMesh recovers parametric sewing patterns from 3D garment surfaces, claiming state-of-the-art results on benchmarks and generalization to real scans and single-view images.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"our method achieves state-of-the-art performance. On real- world data, it generalizes well to high-quality human scans from THuman2.0 [13] and RenderPeople 1, as well as to clothed human meshes captured using our cost-effective multi- view setup with commodity devices such as mobile phones. Furthermore, due to its generality, our framework can be combined with methods such as Hunyuan3D [14] to enable single-view reconstruction. In this setting, our method achieves performance comparable to state-of-the-art approaches [15], [16], while in some cases better preserving fidelity to the input image (see Fig. 1). The main contributions of our work are summarized as follows: • We show that introducing a structured intermediate repre- sentation is key to addressing the ill-posed nature of sewing"},{"citing_arxiv_id":"2604.01479","ref_index":109,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"UniRecGen: Unifying Multi-View 3D Reconstruction and Generation","primary_cat":"cs.CV","submitted_at":"2026-04-01T23:35:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"UniRecGen unifies reconstruction and generation via shared canonical space and disentangled cooperative learning to produce complete, consistent 3D models from sparse views.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.11633","ref_index":39,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MV-SAM3D: Adaptive Multi-View Fusion for Layout-Aware 3D Generation","primary_cat":"cs.CV","submitted_at":"2026-03-12T07:53:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MV-SAM3D adds multi-view fusion via multi-diffusion with attention-entropy and visibility weighting plus physics-aware optimization to improve fidelity and physical plausibility in layout-aware 3D generation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.04876","ref_index":62,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PerpetualWonder: Long-Horizon Action-Conditioned 4D Scene Generation","primary_cat":"cs.CV","submitted_at":"2026-02-04T18:58:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"PerpetualWonder introduces a closed-loop generative simulator with a unified physical-visual representation for long-horizon action-conditioned 4D scene generation from one image.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.04349","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"VecSet-Edit: Unleashing Pre-trained LRM for Mesh Editing from Single Image","primary_cat":"cs.CV","submitted_at":"2026-02-04T09:16:12+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2601.21798","ref_index":41,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CG-MLLM: Captioning and Generating 3D content via Multi-modal Large Language Models","primary_cat":"cs.CV","submitted_at":"2026-01-29T14:42:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"CG-MLLM is a multimodal LLM using a Mixture-of-Transformer architecture with separate TokenAR and BlockAR components integrated with a pre-trained vision-language backbone and 3D VAE to enable 3D captioning and high-fidelity generation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2601.11194","ref_index":64,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ATATA: One Algorithm to Align Them All","primary_cat":"cs.CV","submitted_at":"2026-01-16T11:11:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ATATA enables fast joint inference of structurally aligned pairs using Rectified Flow models via segment transport, improving state-of-the-art for image and video generation while matching 3D quality at much higher speed.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2512.17445","ref_index":61,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LangDriveCTRL: Natural Language Controllable Driving Scene Editing with Multi-modal Agents","primary_cat":"cs.CV","submitted_at":"2025-12-19T10:57:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LangDriveCTRL decomposes driving videos into 3D scene graphs and uses an agentic pipeline with specialized multi-modal agents to perform language-controlled object and behavior edits, achieving nearly 2x higher instruction alignment than prior state-of-the-art methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.07435","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DreamLifting: A Plug-in Module Lifting MV Diffusion Models for 3D Asset Generation","primary_cat":"cs.CV","submitted_at":"2025-09-09T06:43:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"LGAA is a modular adapter framework that lifts multi-view diffusion models to produce 2D Gaussian Splats with PBR channels for high-quality relightable 3D mesh extraction using data-efficient finetuning on 69k instances.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2506.16504","ref_index":23,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Hunyuan3D 2.5: Towards High-Fidelity 3D Assets Generation with Ultimate Details","primary_cat":"cs.CV","submitted_at":"2025-06-19T17:57:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Hunyuan3D 2.5's LATTICE model with 10B parameters generates detailed 3D shapes from images and uses multi-view PBR for textures, outperforming prior methods in fidelity and mesh quality.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2506.15442","ref_index":36,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Hunyuan3D 2.1: From Images to High-Fidelity 3D Assets with Production-Ready PBR Material","primary_cat":"cs.CV","submitted_at":"2025-06-18T13:14:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"Hunyuan3D 2.1 is a two-part system with DiT for shape generation and Paint for texture synthesis that produces high-fidelity 3D assets with PBR materials.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2505.22394","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PacTure: Efficient PBR Texture Generation on Packed Views with Visual Autoregressive Models","primary_cat":"cs.CV","submitted_at":"2025-05-28T14:23:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"PacTure uses view packing and next-scale autoregressive prediction to generate consistent multi-view PBR textures faster than prior sequential or cross-attention methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}