{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T01:58:51Z","timestamp":1740103131643,"version":"3.37.3"},"publisher-location":"New York, NY, USA","reference-count":50,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,8,25]]},"DOI":"10.1145\/3637528.3671640","type":"proceedings-article","created":{"date-parts":[[2024,8,25]],"date-time":"2024-08-25T04:55:12Z","timestamp":1724561712000},"page":"6390-6399","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Bringing Multimodality to Amazon Visual Search System"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-4544-2078","authenticated-orcid":false,"given":"Xinliang","family":"Zhu","sequence":"first","affiliation":[{"name":"Amazon.com, Palo Alto, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0244-6335","authenticated-orcid":false,"given":"Sheng-Wei","family":"Huang","sequence":"additional","affiliation":[{"name":"Amazon.com, Palo Alto, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-1474-8415","authenticated-orcid":false,"given":"Han","family":"Ding","sequence":"additional","affiliation":[{"name":"Amazon.com, Santa Clara, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7004-3570","authenticated-orcid":false,"given":"Jinyu","family":"Yang","sequence":"additional","affiliation":[{"name":"Amazon.com, Palo Alto, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-5680-7223","authenticated-orcid":false,"given":"Kelvin","family":"Chen","sequence":"additional","affiliation":[{"name":"Amazon.com, New York, New York, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-4312-8050","authenticated-orcid":false,"given":"Tao","family":"Zhou","sequence":"additional","affiliation":[{"name":"Amazon.com, Palo Alto, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-0198-240X","authenticated-orcid":false,"given":"Tal","family":"Neiman","sequence":"additional","affiliation":[{"name":"Amazon.com, New York, New York, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-1950-8501","authenticated-orcid":false,"given":"Ouye","family":"Xie","sequence":"additional","affiliation":[{"name":"Amazon.com, Seattle, WA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-9206-2916","authenticated-orcid":false,"given":"Son","family":"Tran","sequence":"additional","affiliation":[{"name":"Amazon.com, Palo Alto, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-8622-3540","authenticated-orcid":false,"given":"Benjamin","family":"Yao","sequence":"additional","affiliation":[{"name":"Amazon.com, Seattle, WA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-3509-582X","authenticated-orcid":false,"given":"Douglas","family":"Gray","sequence":"additional","affiliation":[{"name":"Amazon.com, Palo Alto, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-2571-9950","authenticated-orcid":false,"given":"Anuj","family":"Bindal","sequence":"additional","affiliation":[{"name":"Amazon.com, Palo Alto, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-8233-4301","authenticated-orcid":false,"given":"Arnab","family":"Dhua","sequence":"additional","affiliation":[{"name":"Amazon.com, Palo Alto, CA, USA"}]}],"member":"320","published-online":{"date-parts":[[2024,8,24]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"The Eleventh International Conference on Learning Representations (ICLR 2023)","author":"An X.","year":"2023","unstructured":"X. An, J. Deng, K. Yang, J. Li, Z. Feng, J. Guo, J. Yang, and T. Liu. Unicom: Universal and compact representation learning for image retrieval. In The Eleventh International Conference on Learning Representations (ICLR 2023), 2023."},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV51458.2022.00150"},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3403311"},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00356"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01549"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00276"},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2005.202"},{"key":"e_1_3_2_2_8_1","first-page":"4171","volume-title":"Proceedings of NAACLHLT","volume":"2019","author":"Devlin J.","year":"2018","unstructured":"J. Devlin, M.-W. Chang, K. Lee, and K. Toutanova. Bert: Pre-training of deep bidirectional transformers for language understanding. In Proceedings of NAACLHLT, volume 2019, page 4171, 2018."},{"key":"e_1_3_2_2_9_1","volume-title":"International Conference on Learning Representations","author":"Dosovitskiy A.","year":"2020","unstructured":"A. Dosovitskiy, L. Beyer, A. Kolesnikov, D. Weissenborn, X. Zhai, T. Unterthiner, M. Dehghani, M. Minderer, G. Heigold, S. Gelly, et al. An image is worth 16x16 words: Transformers for image recognition at scale. In International Conference on Learning Representations, 2020."},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3534678.3539071"},{"key":"e_1_3_2_2_11_1","volume-title":"Datacomp: In search of the next generation of multimodal datasets","author":"Gadre S. Y.","year":"2023","unstructured":"S. Y. Gadre, G. Ilharco, A. Fang, J. Hayase, G. Smyrnis, T. Nguyen, R. Marten, M. Wortsman, D. Ghosh, J. Zhang, E. Orgad, R. Entezari, G. Daras, S. Pratt, V. Ramanujan, Y. Bitton, K. Marathe, S. Mussmann, R. Vencu, M. Cherti, R. Krishna, P. W. Koh, O. Saukh, A. Ratner, S. Song, H. Hajishirzi, A. Farhadi, R. Beaumont, S. Oh, A. Dimakis, J. Jitsev, Y. Carmon, V. Shankar, and L. Schmidt. Datacomp: In search of the next generation of multimodal datasets, 2023."},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.382"},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2006.100"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.322"},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-24261-3_7"},{"key":"e_1_3_2_2_16_1","first-page":"4904","volume-title":"International Conference on Machine Learning","author":"Jia C.","year":"2021","unstructured":"C. Jia, Y. Yang, Y. Xia, Y.-T. Chen, Z. Parekh, H. Pham, Q. Le, Y.-H. Sung, Z. Li, and T. Duerig. Scaling up visual and vision-language representation learning with noisy text supervision. In International Conference on Machine Learning, pages 4904--4916. PMLR, 2021."},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00330"},{"key":"e_1_3_2_2_18_1","first-page":"12888","volume-title":"International conference on machine learning","author":"Li J.","year":"2022","unstructured":"J. Li, D. Li, C. Xiong, and S. Hoi. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International conference on machine learning, pages 12888--12900. PMLR, 2022."},{"key":"e_1_3_2_2_19_1","first-page":"34","article-title":"Align before fuse: Vision and language representation learning with momentum distillation","author":"Li J.","year":"2021","unstructured":"J. Li, R. Selvaraju, A. Gotmare, S. Joty, C. Xiong, and S. C. H. Hoi. Align before fuse: Vision and language representation learning with momentum distillation. Advances in Neural Information Processing Systems, 34, 2021.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_20_1","volume-title":"Visualbert: A simple and performant baseline for vision and language. arXiv preprint arXiv:1908.03557","author":"Li L. H.","year":"2019","unstructured":"L. H. Li, M. Yatskar, D. Yin, C.-J. Hsieh, and K.-W. Chang. Visualbert: A simple and performant baseline for vision and language. arXiv preprint arXiv:1908.03557, 2019."},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_8"},{"key":"e_1_3_2_2_22_1","volume-title":"Scaling language-image pretraining via masking","author":"Li Y.","year":"2023","unstructured":"Y. Li, H. Fan, R. Hu, C. Feichtenhofer, and K. He. Scaling language-image pretraining via masking, 2023."},{"key":"e_1_3_2_2_23_1","volume-title":"Supervision exists everywhere: A data efficient contrastive language-image pre-training paradigm","author":"Li Y.","year":"2022","unstructured":"Y. Li, F. Liang, L. Zhao, Y. Cui, W. Ouyang, J. Shao, F. Yu, and J. Yan. Supervision exists everywhere: A data efficient contrastive language-image pre-training paradigm, 2022."},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.124"},{"key":"e_1_3_2_2_25_1","volume-title":"Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. Advances in neural information processing systems, 32","author":"Lu J.","year":"2019","unstructured":"J. Lu, D. Batra, D. Parikh, and S. Lee. Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. Advances in neural information processing systems, 32, 2019."},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053673"},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.47"},{"key":"e_1_3_2_2_28_1","volume-title":"Y. Li, and O. Vinyals. Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748","author":"A.","year":"2018","unstructured":"A. v. d. Oord, Y. Li, and O. Vinyals. Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748, 2018."},{"key":"e_1_3_2_2_29_1","first-page":"8748","volume-title":"International Conference on Machine Learning","author":"Radford A.","year":"2021","unstructured":"A. Radford, J. W. Kim, C. Hallacy, A. Ramesh, G. Goh, S. Agarwal, G. Sastry, A. Askell, P. Mishkin, J. Clark, et al. Learning transferable visual models from natural language supervision. In International Conference on Machine Learning, pages 8748--8763. PMLR, 2021."},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.91"},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298682"},{"key":"e_1_3_2_2_32_1","first-page":"25278","article-title":"Laion-5b: An open largescale dataset for training next generation image-text models","volume":"35","author":"Schuhmann C.","year":"2022","unstructured":"C. Schuhmann, R. Beaumont, R. Vencu, C. Gordon, R. Wightman, M. Cherti, T. Coombes, A. Katta, C. Mullis, M. Wortsman, P. Schramowski, S. Kundurthy, K. Crowson, L. Schmidt, R. Kaczmarczyk, and J. Jitsev. Laion-5b: An open largescale dataset for training next generation image-text models. Advances in Neural Information Processing Systems, 35:25278--25294, 2022.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2019.2923552"},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58586-0_27"},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/2812802"},{"key":"e_1_3_2_2_36_1","first-page":"36","article-title":"Image captioners are scalable vision learners too","author":"Tschannen M.","year":"2024","unstructured":"M. Tschannen, M. Kumar, A. Steiner, X. Zhai, N. Houlsby, and L. Beyer. Image captioners are scalable vision learners too. Advances in Neural Information Processing Systems, 36, 2024.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01269"},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00741"},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00330"},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01838"},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00516"},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3097983.3098162"},{"key":"e_1_3_2_2_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV51458.2022.00052"},{"key":"e_1_3_2_2_45_1","article-title":"Contrastive captioners are image-text foundation models","author":"Yu J.","year":"2022","unstructured":"J. Yu, Z. Wang, V. Vasudevan, L. Yeung, M. Seyedhosseini, and Y. Wu. Coca: Contrastive captioners are image-text foundation models. Transactions on Machine Learning Research, Aug 2022, 2022.","journal-title":"Transactions on Machine Learning Research"},{"key":"e_1_3_2_2_46_1","volume-title":"Florence: A new foundation model for computer vision","author":"Yuan L.","year":"2021","unstructured":"L. Yuan, D. Chen, Y.-L. Chen, N. Codella, X. Dai, J. Gao, H. Hu, X. Huang, B. Li, C. Li, C. Liu, M. Liu, Z. Liu, Y. Lu, Y. Shi, L. Wang, J. Wang, B. Xiao, Z. Xiao, J. Yang, M. Zeng, L. Zhou, and P. Zhang. Florence: A new foundation model for computer vision, 2021."},{"key":"e_1_3_2_2_47_1","volume-title":"British Machine Vision Conference (BMVC)","author":"Zhai A.","year":"2019","unstructured":"A. Zhai and H.-Y. Wu. Classification is a strong baseline for deep metric learning. British Machine Vision Conference (BMVC), 2019."},{"key":"e_1_3_2_2_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/3292500.3330739"},{"key":"e_1_3_2_2_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01759"},{"key":"e_1_3_2_2_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00553"},{"key":"e_1_3_2_2_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/3357384.3357834"}],"event":{"name":"KDD '24: The 30th ACM SIGKDD Conference on Knowledge Discovery and Data Mining","sponsor":["SIGMOD ACM Special Interest Group on Management of Data","SIGKDD ACM Special Interest Group on Knowledge Discovery in Data"],"location":"Barcelona Spain","acronym":"KDD '24"},"container-title":["Proceedings of the 30th ACM SIGKDD Conference on Knowledge Discovery and Data Mining"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3637528.3671640","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,7]],"date-time":"2024-11-07T13:40:53Z","timestamp":1730986853000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3637528.3671640"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,8,24]]},"references-count":50,"alternative-id":["10.1145\/3637528.3671640","10.1145\/3637528"],"URL":"https:\/\/doi.org\/10.1145\/3637528.3671640","relation":{},"subject":[],"published":{"date-parts":[[2024,8,24]]},"assertion":[{"value":"2024-08-24","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}