{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,4,4]],"date-time":"2025-04-04T12:12:23Z","timestamp":1743768743286,"version":"3.28.0"},"reference-count":33,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,6,16]],"date-time":"2024-06-16T00:00:00Z","timestamp":1718496000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,6,16]],"date-time":"2024-06-16T00:00:00Z","timestamp":1718496000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,6,16]]},"DOI":"10.1109\/cvpr52733.2024.00648","type":"proceedings-article","created":{"date-parts":[[2024,9,16]],"date-time":"2024-09-16T17:34:53Z","timestamp":1726508093000},"page":"6786-6795","source":"Crossref","is-referenced-by-count":1,"title":["JeDi: Joint-Image Diffusion Models for Finetuning-Free Personalized Text-to-Image Generation"],"prefix":"10.1109","author":[{"given":"Yu","family":"Zeng","sequence":"first","affiliation":[{"name":"Johns Hopkins University"}]},{"given":"Vishal M.","family":"Patel","sequence":"additional","affiliation":[{"name":"Johns Hopkins University"}]},{"given":"Haochen","family":"Wang","sequence":"additional","affiliation":[{"name":"TTI -Chicago"}]},{"given":"Xun","family":"Huang","sequence":"additional","affiliation":[{"name":"NVIDIA Research"}]},{"given":"Ting-Chun","family":"Wang","sequence":"additional","affiliation":[{"name":"NVIDIA Research"}]},{"given":"Ming-Yu","family":"Liu","sequence":"additional","affiliation":[{"name":"NVIDIA Research"}]},{"given":"Yogesh","family":"Balaji","sequence":"additional","affiliation":[{"name":"NVIDIA Research"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"ref2","article-title":"ediffi: Text-to-image diffusion models with an ensemble of expert denoisers","author":"Balaji","year":"2022","journal-title":"arXiv preprint"},{"journal-title":"Improving image generation with better captions","first-page":"264403242","author":"Betker","key":"ref3"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52729.2023.01764"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"ref6","article-title":"Emu: Enhancing image generation models using photogenic needles in a haystack","author":"Dai","year":"2023","journal-title":"arXiv preprint"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01263"},{"key":"ref8","article-title":"Diffusion models beat gans on image synthesis","volume-title":"Conference on Neural In-formation Processing Systems","author":"Dhariwal","year":"2021"},{"key":"ref9","article-title":"An image is worth one word: Personalizing text-to-image generation using textual inversion","volume-title":"International Conference on Learning and Representation","author":"Gal","year":"2022"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00673"},{"key":"ref11","article-title":"Classifier-free diffusion guidance","volume-title":"NeurIPS Workshop on Deep Generative Models and Downstream Applications","author":"Ho","year":"2021"},{"key":"ref12","article-title":"Denoising diffusion probabilistic models","volume-title":"Conference on Neural Infor-mation Processing Systems","author":"Ho","year":"2020"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"ref14","article-title":"Segment any-thing","author":"Kirillov","year":"2023","journal-title":"arXiv preprint"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00192"},{"key":"ref16","article-title":"Blip-diffusion: Pretrained subject representation for control-lable text-to-image generation and editing","author":"Li","year":"2023","journal-title":"arXiv preprint"},{"key":"ref17","article-title":"Blip-2: Bootstrapping language-image pretraining with frozen image encoders and large language models","author":"Li","year":"2023","journal-title":"arXiv preprint"},{"key":"ref18","article-title":"Grounding dino: Marrying dino with grounded pretraining for open-set object detection","author":"Liu","year":"2023","journal-title":"arXiv preprint"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01117"},{"key":"ref20","article-title":"Glide: Towards photorealistic image generation and editing with text-guided diffusion models","author":"Nichol","year":"2021","journal-title":"arXiv preprint"},{"volume-title":"OpenAI. Chatgpt","year":"2023","key":"ref21"},{"key":"ref22","article-title":"Sdxl: Improving latent diffusion models for high-resolution image synthesis","author":"Podell","year":"2023","journal-title":"arXiv preprint"},{"key":"ref23","article-title":"Learning transferable visual models from natural language supervision","volume-title":"International Conference on Machine Learning","author":"Radford","year":"2021"},{"issue":"1","key":"ref24","first-page":"5485","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel","year":"2020","journal-title":"The Journal of Machine Learning Research"},{"key":"ref25","article-title":"Hierarchical text-conditional image generation with clip latents","author":"Ramesh","year":"2022","journal-title":"arXiv preprint"},{"key":"ref26","article-title":"High-resolution image syn-thesis with latent diffusion models","volume-title":"IEEE Conference on Computer Vision and Pattern Recognition","author":"Rombach","year":"2022"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02155"},{"key":"ref28","article-title":"Photorealistic text-to-image diffusion models with deep language understanding","volume-title":"Conference on Neural Information Processing Systems","author":"Saharia","year":"2022"},{"key":"ref29","article-title":"Laion-400m: Open dataset of clip-filtered 400 million image-text pairs","author":"Schuhmann","year":"2021","journal-title":"arXiv preprint"},{"key":"ref30","article-title":"Instant-booth: Personalized text-to-image generation without test-time finetuning","author":"Shi","year":"2023","journal-title":"arXiv preprint"},{"key":"ref31","article-title":"Score-based generative modeling through stochastic differential equations","volume-title":"Proc. ICLR","author":"Song","year":"2021"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01461"},{"key":"ref33","article-title":"Fastcomposer: Tuning-free multi-subject image generation with localized attention","author":"Xiao","year":"2023","journal-title":"arXiv preprint"}],"event":{"name":"2024 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","start":{"date-parts":[[2024,6,16]]},"location":"Seattle, WA, USA","end":{"date-parts":[[2024,6,22]]}},"container-title":["2024 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10654794\/10654797\/10656241.pdf?arnumber=10656241","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,19]],"date-time":"2024-09-19T06:30:20Z","timestamp":1726727420000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10656241\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,6,16]]},"references-count":33,"URL":"https:\/\/doi.org\/10.1109\/cvpr52733.2024.00648","relation":{},"subject":[],"published":{"date-parts":[[2024,6,16]]}}}