{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T01:47:13Z","timestamp":1740102433572,"version":"3.37.3"},"publisher-location":"New York, NY, USA","reference-count":21,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,12,7]]},"DOI":"10.1145\/3628797.3629011","type":"proceedings-article","created":{"date-parts":[[2023,12,6]],"date-time":"2023-12-06T20:25:34Z","timestamp":1701894334000},"page":"972-979","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Enhancing Video Retrieval with Robust CLIP-Based Multimodal System"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-0972-4109","authenticated-orcid":false,"given":"Minh-Dung","family":"Le-Quynh","sequence":"first","affiliation":[{"name":"Lazada Vietnam, Viet Nam"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-8382-1206","authenticated-orcid":false,"given":"Anh-Tuan","family":"Nguyen","sequence":"additional","affiliation":[{"name":"University of Science, Viet Nam"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-0209-9288","authenticated-orcid":false,"given":"Anh-Tuan","family":"Quang-Hoang","sequence":"additional","affiliation":[{"name":"Ford Motor, United States"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-1374-5236","authenticated-orcid":false,"given":"Van-Huy","family":"Dinh","sequence":"additional","affiliation":[{"name":"HUTECH University, Viet Nam"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-0196-6083","authenticated-orcid":false,"given":"Tien-Huy","family":"Nguyen","sequence":"additional","affiliation":[{"name":"University of Information Technology, Viet Nam"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-2290-1187","authenticated-orcid":false,"given":"Hoang-Bach","family":"Ngo","sequence":"additional","affiliation":[{"name":"University of Science, Viet Nam"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-0394-4731","authenticated-orcid":false,"given":"Minh-Hung","family":"An","sequence":"additional","affiliation":[{"name":"FPT Telecom, Vietnam"}]}],"member":"320","published-online":{"date-parts":[[2023,12,7]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.14569\/IJACSA.2021.0120776"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00980"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3372278.3391927"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/TBDATA.2019.2921572"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1080\/14015430802657216"},{"key":"e_1_3_2_1_6_1","unstructured":"Maksim Kuprashevich and Irina Tolstykh. 2023. MiVOLO: Multi-input Transformer for Age and Gender Estimation. (2023). arXiv:arXiv:2307.04616"},{"key":"e_1_3_2_1_7_1","volume-title":"BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation. In ICML.","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation. In ICML."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/JAS.2019.1911693"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.247"},{"key":"e_1_3_2_1_10_1","volume-title":"Video Search with CLIP and Interactive Text Query Reformulation. In International Conference on Multimedia Modeling. Springer, 628\u2013633","author":"Loko\u010d Jakub","year":"2023","unstructured":"Jakub Loko\u010d, Zuzana Vop\u00e1lkov\u00e1, Patrik Dokoupil, and Ladislav Pe\u0161ka. 2023. Video Search with CLIP and Interactive Text Query Reformulation. In International Conference on Multimedia Modeling. Springer, 628\u2013633."},{"key":"e_1_3_2_1_11_1","volume-title":"Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. In Advances in Neural Information Processing Systems. 13\u201323.","author":"Lu Jiasen","year":"2019","unstructured":"Jiasen Lu, Dhruv Batra, Devi Parikh, and Stefan Lee. 2019. Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. In Advances in Neural Information Processing Systems. 13\u201323."},{"key":"e_1_3_2_1_12_1","volume-title":"International journal of recent technology and engineering (IJRTE) 2, 1","author":"Mithe Ravina","year":"2013","unstructured":"Ravina Mithe, Supriya Indalkar, and Nilam Divekar. 2013. Optical character recognition. International journal of recent technology and engineering (IJRTE) 2, 1 (2013), 72\u201375."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP.2016.7532801"},{"key":"e_1_3_2_1_14_1","volume-title":"International conference on machine learning. PMLR, 8748\u20138763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748\u20138763."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19839-7_15"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1514"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.5402\/2012\/376804"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICMEW.2015.7169816"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01225-0_19"},{"key":"e_1_3_2_1_20_1","volume-title":"How transferable are features in deep neural networks?Advances in neural information processing systems 27","author":"Yosinski Jason","year":"2014","unstructured":"Jason Yosinski, Jeff Clune, Yoshua Bengio, and Hod Lipson. 2014. How transferable are features in deep neural networks?Advances in neural information processing systems 27 (2014)."},{"key":"e_1_3_2_1_21_1","volume-title":"Proc. NIPS Workshop on Deep Learning and Unsupervised Feature Learning. sn.","author":"Yu Dong","year":"2010","unstructured":"Dong Yu, Li Deng, and George Dahl. 2010. Roles of pre-training and fine-tuning in context-dependent DBN-HMMs for real-world speech recognition. In Proc. NIPS Workshop on Deep Learning and Unsupervised Feature Learning. sn."}],"event":{"name":"SOICT 2023: The 12th International Symposium on Information and Communication Technology","acronym":"SOICT 2023","location":"Ho Chi Minh Vietnam"},"container-title":["Proceedings of the 12th International Symposium on Information and Communication Technology"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3628797.3629011","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,3,5]],"date-time":"2024-03-05T22:17:08Z","timestamp":1709677028000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3628797.3629011"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,12,7]]},"references-count":21,"alternative-id":["10.1145\/3628797.3629011","10.1145\/3628797"],"URL":"https:\/\/doi.org\/10.1145\/3628797.3629011","relation":{},"subject":[],"published":{"date-parts":[[2023,12,7]]},"assertion":[{"value":"2023-12-07","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}