{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,26]],"date-time":"2025-03-26T00:48:00Z","timestamp":1742950080449,"version":"3.40.3"},"publisher-location":"Cham","reference-count":63,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031732256"},{"type":"electronic","value":"9783031732263"}],"license":[{"start":{"date-parts":[[2024,11,1]],"date-time":"2024-11-01T00:00:00Z","timestamp":1730419200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,1]],"date-time":"2024-11-01T00:00:00Z","timestamp":1730419200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73226-3_13","type":"book-chapter","created":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T15:02:57Z","timestamp":1730386977000},"page":"217-235","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Diverse Text-to-3D Synthesis with\u00a0Augmented Text Embedding"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-4950-5235","authenticated-orcid":false,"given":"Uy Dieu","family":"Tran","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0009-1756-2054","authenticated-orcid":false,"given":"Minh","family":"Luu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9678-0886","authenticated-orcid":false,"given":"Phong Ha","family":"Nguyen","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9259-420X","authenticated-orcid":false,"given":"Khoi","family":"Nguyen","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5706-8634","authenticated-orcid":false,"given":"Binh-Son","family":"Hua","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,1]]},"reference":[{"key":"13_CR1","unstructured":"Bishop, C.M.: Pattern Recognition and Machine Learning, pp. 469\u2013470. Springer (2006)"},{"key":"13_CR2","volume-title":"Blender - a 3D Modelling and Rendering Package","author":"Blender Online Community","year":"2018","unstructured":"Blender Online Community: Blender - a 3D Modelling and Rendering Package. Blender Foundation, Blender Institute, Amsterdam (2018)"},{"key":"13_CR3","unstructured":"Brock, A., Donahue, J., Simonyan, K.: Large scale GAN training for high fidelity natural image synthesis. In: International Conference on Learning Representations (2019)"},{"key":"13_CR4","doi-asserted-by":"crossref","unstructured":"Chan, E.R., et\u00a0al.: Efficient geometry-aware 3D generative adversarial networks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16123\u201316133 (2022)","DOI":"10.1109\/CVPR52688.2022.01565"},{"key":"13_CR5","unstructured":"Chang, A.X., et\u00a0al.: Shapenet: an information-rich 3d model repository. arXiv preprint arXiv:1512.03012 (2015)"},{"key":"13_CR6","unstructured":"Chang, H., Zhang, H., et\u00a0al.: Muse: text-to-image generation via masked generative transformers. 
arXiv preprint arXiv:2301.00704 (2023)"},{"key":"13_CR7","doi-asserted-by":"crossref","unstructured":"Chen, A., Xu, Z., Geiger, A., Yu, J., Su, H.: Tensorf: tensorial radiance fields. In: European Conference on Computer Vision (ECCV) (2022)","DOI":"10.1007\/978-3-031-19824-3_20"},{"key":"13_CR8","doi-asserted-by":"crossref","unstructured":"Chen, R., Chen, Y., Jiao, N., Jia, K.: Fantasia3D: disentangling geometry and appearance for high-quality text-to-3D content creation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV) (October 2023)","DOI":"10.1109\/ICCV51070.2023.02033"},{"key":"13_CR9","unstructured":"Chen, Y., Li, Z., Liu, P.: Et3d: efficient text-to-3D generation via multi-view distillation (2023)"},{"key":"13_CR10","doi-asserted-by":"crossref","unstructured":"Deitke, M., et al.: Objaverse: a universe of annotated 3D objects. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13142\u201313153 (2023)","DOI":"10.1109\/CVPR52729.2023.01263"},{"key":"13_CR11","unstructured":"Gal, R., et al.: An image is worth one word: personalizing text-to-image generation using textual inversion. In: The Eleventh International Conference on Learning Representations (2023)"},{"key":"13_CR12","doi-asserted-by":"crossref","unstructured":"Garbin, S.J., Kowalski, M., Johnson, M., Shotton, J., Valentin, J.: Fastnerf: high-fidelity neural rendering at 200fps. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 14346\u201314355 (2021)","DOI":"10.1109\/ICCV48922.2021.01408"},{"key":"13_CR13","unstructured":"Guo, Y.C., et al.: threestudio: a unified framework for 3d content generation (2023)"},{"key":"13_CR14","unstructured":"Han, I., Yang, S., Kwon, T., Ye, J.C.: Highly personalized text embedding for image manipulation by stable diffusion. arXiv preprint arXiv:2303.08767 (2023)"},{"key":"13_CR15","unstructured":"Heusel, M., Ramsauer, H., Unterthiner, T., Nessler, B., Hochreiter, S.: Gans trained by a two time-scale update rule converge to a local nash equilibrium. In: Guyon, I., Luxburg, U.V., Bengio, S., Wallach, H., Fergus, R., Vishwanathan, S., Garnett, R. (eds.) Advances in Neural Information Processing Systems. vol.\u00a030. Curran Associates, Inc. (2017)"},{"key":"13_CR16","unstructured":"Hong, F., et al.: 3dtopia: large text-to-3d generation model with hybrid diffusion priors. arXiv preprint arXiv:2403.02234 (2024)"},{"key":"13_CR17","unstructured":"Hong, Y., et al.: LRM: large reconstruction model for single image to 3D. In: The Twelfth International Conference on Learning Representations (2024)"},{"key":"13_CR18","unstructured":"Hu, E.J., et al.: LoRA: low-rank adaptation of large language models. In: International Conference on Learning Representations (2022)"},{"key":"13_CR19","doi-asserted-by":"crossref","unstructured":"Huang, T., et al.: Dreamcontrol: control-based text-to-3d generation with 3D self-prior. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 5364\u20135373 (June 2024)","DOI":"10.1109\/CVPR52733.2024.00513"},{"key":"13_CR20","unstructured":"Huang, Y., Wang, J., Shi, Y., Tang, B., Qi, X., Zhang, L.: Dreamtime: an improved optimization strategy for diffusion-guided 3D generation. 
In: The Twelfth International Conference on Learning Representations (2024)"},{"key":"13_CR21","doi-asserted-by":"crossref","unstructured":"Jain, A., Mildenhall, B., Barron, J.T., Abbeel, P., Poole, B.: Zero-shot text-guided object generation with dream fields. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 867\u2013876 (2022)","DOI":"10.1109\/CVPR52688.2022.00094"},{"key":"13_CR22","unstructured":"Jun, H., Nichol, A.: Shap-e: Generating conditional 3d implicit functions (2023). https:\/\/arxiv.org\/abs\/2305.02463"},{"key":"13_CR23","doi-asserted-by":"crossref","unstructured":"Kang, M., et al.: Scaling up gans for text-to-image synthesis. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.00976"},{"key":"13_CR24","unstructured":"Katzir, O., Patashnik, O., Cohen-Or, D., Lischinski, D.: Noise-free score distillation. arXiv preprint arXiv:2310.17590 (2023)"},{"key":"13_CR25","doi-asserted-by":"crossref","unstructured":"Kerbl, B., Kopanas, G., Leimk\u00fchler, T., Drettakis, G.: 3D gaussian splatting for real-time radiance field rendering. ACM Trans. Graph. 42(4) (July 2023)","DOI":"10.1145\/3592433"},{"key":"13_CR26","unstructured":"Lee, K., Sohn, K., Shin, J.: Dreamflow: High-quality text-to-3D generation by approximating probability flow. In: The Twelfth International Conference on Learning Representations (2024)"},{"key":"13_CR27","unstructured":"Li, J., et al.: Instant3d: fast text-to-3D with sparse-view generation and large reconstruction model. In: The Twelfth International Conference on Learning Representations (2024)"},{"key":"13_CR28","doi-asserted-by":"crossref","unstructured":"Lin, C.H., et al.: Magic3d: high-resolution text-to-3D content creation. In: IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.00037"},{"key":"13_CR29","unstructured":"Liu, M., Xu, C., Jin, H., Chen, L., Xu, Z., Su, H., et\u00a0al.: One-2-3-45: any single image to 3D mesh in 45 seconds without per-shape optimization. arXiv preprint arXiv:2306.16928 (2023)"},{"key":"13_CR30","doi-asserted-by":"crossref","unstructured":"Liu, R., Wu, R., Van\u00a0Hoorick, B., Tokmakov, P., Zakharov, S., Vondrick, C.: Zero-1-to-3: Zero-shot one image to 3d object. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 9298\u20139309 (October 2023)","DOI":"10.1109\/ICCV51070.2023.00853"},{"key":"13_CR31","unstructured":"Liu, Y., et al.: Syncdreamer: learning to generate multiview-consistent images from a single-view image. arXiv preprint arXiv:2309.03453 (2023)"},{"key":"13_CR32","doi-asserted-by":"crossref","unstructured":"Lorraine, J., et al.: Att3d: amortized text-to-3D object synthesis. arXiv preprint arXiv:2306.07349 (2023)","DOI":"10.1109\/ICCV51070.2023.01645"},{"key":"13_CR33","doi-asserted-by":"crossref","unstructured":"Mildenhall, B., Srinivasan, P.P., Tancik, M., Barron, J.T., Ramamoorthi, R., Ng, R.: Nerf: Representing scenes as neural radiance fields for view synthesis. In: ECCV (2020)","DOI":"10.1007\/978-3-030-58452-8_24"},{"issue":"4","key":"13_CR34","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3528223.3530127","volume":"41","author":"T M\u00fcller","year":"2022","unstructured":"M\u00fcller, T., Evans, A., Schied, C., Keller, A.: Instant neural graphics primitives with a multiresolution hash encoding. ACM Trans. Graph. (ToG) 41(4), 1\u201315 (2022)","journal-title":"ACM Trans. Graph. 
(ToG)"},{"key":"13_CR35","doi-asserted-by":"publisher","unstructured":"Nguyen-Ha, P., Sarafianos, N., Lassner, C., Heikkil\u00e4, J., Tung, T.: Free-viewpoint rgb-d human performance capture and rendering. In: European Conference on Computer Vision, pp. 473\u2013491. Springer (2022). https:\/\/doi.org\/10.1007\/978-3-031-19787-1_27","DOI":"10.1007\/978-3-031-19787-1_27"},{"key":"13_CR36","unstructured":"OpenAI: Gpt-4 technical report. ArXiv:2303.08774 (2023)"},{"key":"13_CR37","unstructured":"Poole, B., Jain, A., Barron, J.T., Mildenhall, B.: Dreamfusion: Text-to-3d using 2d diffusion. In: The Eleventh International Conference on Learning Representations (2023)"},{"key":"13_CR38","unstructured":"Qian, G., et al.: Atom: Amortized text-to-mesh using 2D diffusion (2024)"},{"key":"13_CR39","unstructured":"Qian, G., et al.: Magic123: One image to high-quality 3D object generation using both 2D and 3D diffusion priors. In: The Twelfth International Conference on Learning Representations (2024)"},{"key":"13_CR40","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: Meila, M., Zhang, T. (eds.) Proceedings of the 38th International Conference on Machine Learning Proceedings of Machine Learning Research, vol.\u00a0139, pp. 8748\u20138763. PMLR (18\u201324 Jul 2021)"},{"key":"13_CR41","doi-asserted-by":"crossref","unstructured":"Raj, A., et al.: Dreambooth3D: subject-driven text-to-3D generation. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.00223"},{"key":"13_CR42","unstructured":"Ramesh, A., et al.: Zero-shot text-to-image generation. In: Meila, M., Zhang, T. (eds.) Proceedings of the 38th International Conference on Machine Learning. Proceedings of Machine Learning Research, vol.\u00a0139, pp. 8821\u20138831. PMLR (18\u201324 Jul 2021)"},{"key":"13_CR43","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"13_CR44","doi-asserted-by":"crossref","unstructured":"Ruiz, N., Li, Y., Jampani, V., Pritch, Y., Rubinstein, M., Aberman, K.: Dreambooth: Fine tuning text-to-image diffusion models for subject-driven generation. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.02155"},{"key":"13_CR45","unstructured":"Saharia, C., et al.: Photorealistic text-to-image diffusion models with deep language understanding. In: Oh, A.H., Agarwal, A., Belgrave, D., Cho, K. (eds.) Advances in Neural Information Processing Systems (2022)"},{"key":"13_CR46","unstructured":"Sauer, A., Karras, T., Laine, S., Geiger, A., Aila, T.: StyleGAN-t: unlocking the power of GANs for fast large-scale text-to-image synthesis. In: Krause, A., Brunskill, E., Cho, K., Engelhardt, B., Sabato, S., Scarlett, J. (eds.) Proceedings of the 40th International Conference on Machine Learning. Proceedings of Machine Learning Research, vol.\u00a0202, pp. 30105\u201330118. PMLR (23\u201329 Jul 2023)"},{"key":"13_CR47","unstructured":"Seo, J., et al.: Let 2d diffusion model know 3d-consistency for robust text-to-3D generation. In: The Twelfth International Conference on Learning Representations (2024)"},{"key":"13_CR48","unstructured":"Shi, Y., Wang, P., Ye, J., Mai, L., Li, K., Yang, X.: MVDream: multi-view diffusion for 3D generation. 
In: The Twelfth International Conference on Learning Representations (2024)"},{"key":"13_CR49","doi-asserted-by":"crossref","unstructured":"Shi, Z., Zhou, X., Qiu, X., Zhu, X.: Improving image captioning with better use of caption. In: Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, pp. 7454\u20137464 (2020)","DOI":"10.18653\/v1\/2020.acl-main.664"},{"key":"13_CR50","unstructured":"Shi, Z., Peng, S., Xu, Y., Geiger, A., Liao, Y., Shen, Y.: Deep generative models on 3D representations: a survey. arXiv preprint arXiv:2210.15663 (2023)"},{"key":"13_CR51","doi-asserted-by":"crossref","unstructured":"Sun, C., Sun, M., Chen, H.T.: Direct voxel grid optimization: super-fast convergence for radiance fields reconstruction. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5459\u20135469 (2022)","DOI":"10.1109\/CVPR52688.2022.00538"},{"key":"13_CR52","doi-asserted-by":"crossref","unstructured":"Tang, J., Chen, Z., Chen, X., Wang, T., Zeng, G., Liu, Z.: Lgm: large multi-view gaussian model for high-resolution 3d content creation (2024)","DOI":"10.1007\/978-3-031-73235-5_1"},{"key":"13_CR53","unstructured":"Tang, J., Ren, J., Zhou, H., Liu, Z., Zeng, G.: Dreamgaussian: generative Gaussian splatting for efficient 3D content creation. arXiv preprint arXiv:2309.16653 (2023)"},{"key":"13_CR54","doi-asserted-by":"crossref","unstructured":"Tewari, A., et\u00a0al.: Advances in neural rendering. In: Computer Graphics Forum, vol.\u00a041, pp. 703\u2013735. Wiley Online Library (2022)","DOI":"10.1111\/cgf.14507"},{"key":"13_CR55","doi-asserted-by":"crossref","unstructured":"Wang, P., et al.: Taming mode collapse in score distillation for text-to-3D generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 9037\u20139047 (June 2024)","DOI":"10.1109\/CVPR52733.2024.00863"},{"key":"13_CR56","unstructured":"Wang, Z., et al.: Prolificdreamer: high-fidelity and diverse text-to-3D generation with variational score distillation. In: NeurIPS (2023)"},{"key":"13_CR57","unstructured":"Weng, H., et al.: Consistent123: Improve consistency for one image to 3D object synthesis. arXiv preprint arXiv:2310.08092 (2023)"},{"key":"13_CR58","doi-asserted-by":"crossref","unstructured":"Wiles, O., Gkioxari, G., Szeliski, R., Johnson, J.: Synsin: end-to-end view synthesis from a single image. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7467\u20137477 (2020)","DOI":"10.1109\/CVPR42600.2020.00749"},{"key":"13_CR59","doi-asserted-by":"crossref","unstructured":"Yu, A., Li, R., Tancik, M., Li, H., Ng, R., Kanazawa, A.: Plenoctrees for real-time rendering of neural radiance fields. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 5752\u20135761 (2021)","DOI":"10.1109\/ICCV48922.2021.00570"},{"key":"13_CR60","unstructured":"Yu, J., et al.: Scaling autoregressive models for content-rich text-to-image generation. Transactions on Machine Learning Research (2022)"},{"key":"13_CR61","unstructured":"Yu, X., Guo, Y.C., Li, Y., Liang, D., Zhang, S.H., QI, X.: Text-to-3D with classifier score distillation. In: The Twelfth International Conference on Learning Representations (2024)"},{"key":"13_CR62","unstructured":"Zhang, H., et al.: DINO: DETR with improved denoising anchor boxes for end-to-end object detection. 
In: The Eleventh International Conference on Learning Representations (2023)"},{"key":"13_CR63","unstructured":"Zhu, J., Zhuang, P., Koyejo, S.: HIFA: High-fidelity text-to-3D generation with advanced diffusion guidance. In: The Twelfth International Conference on Learning Representations (2024)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73226-3_13","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T15:15:32Z","timestamp":1730387732000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73226-3_13"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,1]]},"ISBN":["9783031732256","9783031732263"],"references-count":63,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73226-3_13","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,11,1]]},"assertion":[{"value":"1 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}
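The record above is an instance of the Crossref REST API "work" schema, keyed by DOI and wrapped in a "message" envelope. As a minimal sketch of how such a record can be retrieved and read (the api.crossref.org endpoint is the public Crossref REST API; the field names are the ones appearing in the record above, and the requests library is assumed to be installed):

import requests

# Fetch the Crossref metadata record for this chapter by its DOI.
doi = "10.1007/978-3-031-73226-3_13"
resp = requests.get(f"https://api.crossref.org/works/{doi}", timeout=30)
resp.raise_for_status()

# The work record sits under the "message" key of the envelope,
# alongside "status", "message-type", and "message-version".
work = resp.json()["message"]

print(work["title"][0])           # chapter title
print(work["container-title"])    # LNCS series title and ECCV 2024 volume title
print(work["page"], work["DOI"])  # "217-235" and the DOI itself
for author in work["author"]:
    # "given"/"family" names and ORCID, as in the "author" array above.
    print(author.get("given", ""), author.get("family", ""), author.get("ORCID", ""))
print("cited references:", work["references-count"])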