{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,12,4]],"date-time":"2024-12-04T02:10:03Z","timestamp":1733278203420,"version":"3.30.1"},"reference-count":85,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100000266","name":"Engineering and Physical Sciences Research Council","doi-asserted-by":"publisher","award":["EP\/T028572\/1"],"id":[{"id":"10.13039\/501100000266","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100006595","name":"Unitatea Executiva pentru Finantarea Invatamantului Superior, a Cercetarii, Dezvoltarii si Inovarii","doi-asserted-by":"publisher","award":["EEA-RO-2018-0496"],"id":[{"id":"10.13039\/501100006595","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Artificial Intelligence"],"published-print":{"date-parts":[[2025,1]]},"DOI":"10.1016\/j.artint.2024.104235","type":"journal-article","created":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T17:53:31Z","timestamp":1730397211000},"page":"104235","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["TeachText: CrossModal text-video retrieval through generalized distillation"],"prefix":"10.1016","volume":"338","author":[{"given":"Ioana","family":"Croitoru","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3718-8196","authenticated-orcid":false,"given":"Simion-Vlad","family":"Bogolin","sequence":"additional","affiliation":[]},{"given":"Marius","family":"Leordeanu","sequence":"additional","affiliation":[]},{"given":"Hailin","family":"Jin","sequence":"additional","affiliation":[]},{"given":"Andrew","family":"Zisserman","sequence":"additional","affiliation":[]},{"given":"Yang","family":"Liu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1732-9198","authenticated-orcid":false,"given":"Samuel","family":"Albanie","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.artint.2024.104235_br0010","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","article-title":"Teachtext: crossmodal generalized distillation for text-video retrieval","author":"Croitoru","year":"2021"},{"author":"Wang","key":"10.1016\/j.artint.2024.104235_br0020"},{"key":"10.1016\/j.artint.2024.104235_br0030","series-title":"Proceedings of the Asian Conference on Computer Vision","article-title":"Condensed movies: story based retrieval with contextual embeddings","author":"Bain","year":"2020"},{"key":"10.1016\/j.artint.2024.104235_br0040","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","article-title":"Relational knowledge distillation","author":"Park","year":"2019"},{"author":"Miech","key":"10.1016\/j.artint.2024.104235_br0050"},{"author":"Liu","key":"10.1016\/j.artint.2024.104235_br0060"},{"key":"10.1016\/j.artint.2024.104235_br0070","series-title":"Proceedings of the European Conference on Computer Vision","article-title":"Multi-modal transformer for video retrieval","author":"Gabeur","year":"2020"},{"key":"10.1016\/j.artint.2024.104235_br0080","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","article-title":"Jointly modeling deep video and compositional text to bridge vision and language in a unified framework","author":"Xu","year":"2015"},{"key":"10.1016\/j.artint.2024.104235_br0090","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","article-title":"Video Google: a text retrieval approach to object matching in videos","author":"Sivic","year":"2003"},{"key":"10.1016\/j.artint.2024.104235_br0100","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","article-title":"Retrieving actions in movies","author":"Laptev","year":"2007"},{"key":"10.1016\/j.artint.2024.104235_br0110","series-title":"Proceedings of the ACM International Conference on Image and Video Retrieval","article-title":"Towards optimal bag-of-features for object categorization and semantic video retrieval","author":"Jiang","year":"2007"},{"key":"10.1016\/j.artint.2024.104235_br0120","series-title":"Proceedings of the 22nd ACM International Conference on Multimedia","article-title":"Easy samples first: self-paced reranking for zero-example multimedia search","author":"Jiang","year":"2014"},{"key":"10.1016\/j.artint.2024.104235_br0130","series-title":"Recognizing an Action Using Its Name: A Knowledge-Based Approach, vol. 120","first-page":"61","author":"Gan","year":"2016"},{"key":"10.1016\/j.artint.2024.104235_br0140","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","article-title":"Deck: discovering event composition knowledge from web images for zero-shot event detection and recounting in videos","author":"Gan","year":"2017"},{"key":"10.1016\/j.artint.2024.104235_br0150","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","article-title":"Utilizing semantic word similarity measures for video retrieval","author":"Aytar","year":"2008"},{"author":"Dong","key":"10.1016\/j.artint.2024.104235_br0160"},{"key":"10.1016\/j.artint.2024.104235_br0170","series-title":"Proceedings of the ACM International Conference on Multimedia Retrieval","article-title":"Learning joint embedding with multimodal cues for cross-modal video-text retrieval","author":"Mithun","year":"2018"},{"key":"10.1016\/j.artint.2024.104235_br0180","series-title":"Proceedings of the European Conference on Computer Vision","article-title":"A joint sequence fusion model for video question answering and retrieval","author":"Yu","year":"2018"},{"key":"10.1016\/j.artint.2024.104235_br0190","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","article-title":"Fine-grained action retrieval through multiple parts-of-speech embeddings","author":"Wray","year":"2019"},{"key":"10.1016\/j.artint.2024.104235_br0200","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","article-title":"Fine-grained video-text retrieval with hierarchical graph reasoning","author":"Chen","year":"2020"},{"key":"10.1016\/j.artint.2024.104235_br0210","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","article-title":"Howto100m: learning a text-video embedding by watching hundred million narrated video clips","author":"Miech","year":"2019"},{"author":"Miech","key":"10.1016\/j.artint.2024.104235_br0220"},{"author":"Korbar","key":"10.1016\/j.artint.2024.104235_br0230"},{"author":"Patrick","key":"10.1016\/j.artint.2024.104235_br0240"},{"key":"10.1016\/j.artint.2024.104235_br0250","series-title":"Multimodal Video Indexing: A Review of the State-of-the-Art, vol. 25","first-page":"5","author":"Snoek","year":"2005"},{"author":"Mikolov","key":"10.1016\/j.artint.2024.104235_br0260"},{"unstructured":"A. Radford, K. Narasimhan, T. Salimans, I. Sutskever, et al., Improving language understanding by generative pre-training, OpenAI, 2018.","key":"10.1016\/j.artint.2024.104235_br0270"},{"key":"10.1016\/j.artint.2024.104235_br0280","series-title":"Preprint","article-title":"Language models are unsupervised multitask learners","author":"Radford","year":"2019"},{"key":"10.1016\/j.artint.2024.104235_br0290","series-title":"Proceedings of the Association for Computational Linguistics","article-title":"Bert: pre-training of deep bidirectional transformers for language understanding","author":"Devlin","year":"2019"},{"key":"10.1016\/j.artint.2024.104235_br0300","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","article-title":"Language features matter: effective language representations for vision-language tasks","author":"Burns","year":"2019"},{"author":"Liu","key":"10.1016\/j.artint.2024.104235_br0310"},{"author":"Lan","key":"10.1016\/j.artint.2024.104235_br0320"},{"key":"10.1016\/j.artint.2024.104235_br0330","series-title":"Born Again Trees, vol. 1","first-page":"2","author":"Breiman","year":"1996"},{"key":"10.1016\/j.artint.2024.104235_br0340","series-title":"Proceedings of the ACM SIGKDD International Conference on Knowledge Discovery and Data Mining","article-title":"Model compression","author":"Bucilu\u01ce","year":"2006"},{"author":"Hinton","key":"10.1016\/j.artint.2024.104235_br0350"},{"key":"10.1016\/j.artint.2024.104235_br0360","series-title":"Proceedings of the International Conference on Learning Representations","article-title":"Unifying distillation and privileged information","author":"Lopez-Paz","year":"2016"},{"key":"10.1016\/j.artint.2024.104235_br0370","series-title":"A New Learning Paradigm: Learning Using Privileged Information, vol. 22","first-page":"544","author":"Vapnik","year":"2009"},{"key":"10.1016\/j.artint.2024.104235_br0380","first-page":"2023","article-title":"Learning using privileged information: similarity control and knowledge transfer","volume":"16","author":"Vapnik","year":"2015"},{"author":"Romero","key":"10.1016\/j.artint.2024.104235_br0390"},{"author":"Zagoruyko","key":"10.1016\/j.artint.2024.104235_br0400"},{"author":"Huang","key":"10.1016\/j.artint.2024.104235_br0410"},{"key":"10.1016\/j.artint.2024.104235_br0420","first-page":"5011","article-title":"Hoi analysis: integrating and decomposing human-object interaction","volume":"vol. 33","author":"Li","year":"2020"},{"key":"10.1016\/j.artint.2024.104235_br0430","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","article-title":"Deep visual-semantic quantization for efficient image retrieval","author":"Cao","year":"2017"},{"key":"10.1016\/j.artint.2024.104235_br0440","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","article-title":"Describing like humans: on diversity in image captioning","author":"Wang","year":"2019"},{"key":"10.1016\/j.artint.2024.104235_br0450","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","article-title":"Weakly-supervised alignment of video with text","author":"Bojanowski","year":"2015"},{"key":"10.1016\/j.artint.2024.104235_br0460","series-title":"Grounded Compositional Semantics for Finding and Describing Images with Sentences, vol. 2","first-page":"207","author":"Socher","year":"2014"},{"key":"10.1016\/j.artint.2024.104235_br0470","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","article-title":"Quo vadis, action recognition? A new model and the kinetics dataset","author":"Carreira","year":"2017"},{"key":"10.1016\/j.artint.2024.104235_br0480","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","article-title":"A closer look at spatiotemporal convolutions for action recognition","author":"Tran","year":"2018"},{"key":"10.1016\/j.artint.2024.104235_br0490","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","article-title":"Large-scale weakly-supervised pre-training for video action recognition","author":"Ghadiyaram","year":"2019"},{"key":"10.1016\/j.artint.2024.104235_br0500","series-title":"Advances in Neural Information Processing Systems","article-title":"Pytorch: an imperative style, high-performance deep learning library","author":"Paszke","year":"2019"},{"author":"Kingma","key":"10.1016\/j.artint.2024.104235_br0510"},{"key":"10.1016\/j.artint.2024.104235_br0520","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","article-title":"Msr-vtt: a large video description dataset for bridging video and language","author":"Xu","year":"2016"},{"key":"10.1016\/j.artint.2024.104235_br0530","series-title":"Proceedings of the Association for Computational Linguistics","article-title":"Collecting highly parallel data for paraphrase evaluation","author":"Chen","year":"2011"},{"key":"10.1016\/j.artint.2024.104235_br0540","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","article-title":"Sequence to sequence-video to text","author":"Venugopalan","year":"2015"},{"key":"10.1016\/j.artint.2024.104235_br0550","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","article-title":"Localizing moments in video with natural language","author":"Anne Hendricks","year":"2017"},{"key":"10.1016\/j.artint.2024.104235_br0560","series-title":"Yfcc100m: The New Data in Multimedia Research, vol. 59","first-page":"64","author":"Thomee","year":"2016"},{"key":"10.1016\/j.artint.2024.104235_br0570","series-title":"Proceedings of the European Conference on Computer Vision","article-title":"Cross-modal and hierarchical modeling of video and text","author":"Zhang","year":"2018"},{"key":"10.1016\/j.artint.2024.104235_br0580","series-title":"Movie Description, vol. 123","first-page":"94","author":"Rohrbach","year":"2017"},{"key":"10.1016\/j.artint.2024.104235_br0590","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","article-title":"Activitynet: a large-scale video benchmark for human activity understanding","author":"Caba Heilbron","year":"2015"},{"key":"10.1016\/j.artint.2024.104235_br0600","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","article-title":"Vatex: a large-scale, high-quality multilingual dataset for video-and-language research","author":"Wang","year":"2019"},{"author":"Oncescu","key":"10.1016\/j.artint.2024.104235_br0610"},{"doi-asserted-by":"crossref","unstructured":"J. Hu, L. Shen, S. Albanie, G. Sun, E. Wu, 2019, Squeeze-and-excitation networks.","key":"10.1016\/j.artint.2024.104235_br0620","DOI":"10.1109\/CVPR.2018.00745"},{"key":"10.1016\/j.artint.2024.104235_br0630","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","article-title":"Aggregated residual transformations for deep neural networks","author":"Xie","year":"2017"},{"key":"10.1016\/j.artint.2024.104235_br0640","series-title":"Proceedings of the European Conference on Computer Vision","article-title":"Exploring the limits of weakly supervised pretraining","author":"Mahajan","year":"2018"},{"key":"10.1016\/j.artint.2024.104235_br0650","series-title":"International Conference on Acoustics, Speech and Signal Processing (ICASSP)","article-title":"Cnn architectures for large-scale audio classification","author":"Hershey","year":"2017"},{"doi-asserted-by":"crossref","unstructured":"B. Zhou, A. Lapedriza, A. Khosla, A. Oliva, A. Torralba, Places: A 10 million image database for scene recognition, 2017.","key":"10.1016\/j.artint.2024.104235_br0660","DOI":"10.1167\/17.10.296"},{"key":"10.1016\/j.artint.2024.104235_br0670","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","article-title":"Densely connected convolutional networks","author":"Huang","year":"2017"},{"key":"10.1016\/j.artint.2024.104235_br0680","series-title":"Visual Genome: Connecting Language and Vision Using Crowdsourced Dense Image Annotations, vol. 123","first-page":"32","author":"Krishna","year":"2017"},{"key":"10.1016\/j.artint.2024.104235_br0690","series-title":"Advances in Neural Information Processing Systems","article-title":"Attention is all you need","author":"Vaswani","year":"2017"},{"author":"F. team","key":"10.1016\/j.artint.2024.104235_br0700"},{"key":"10.1016\/j.artint.2024.104235_br0710","series-title":"Proceedings of the ACM SIGKDD International Conference on Knowledge Discovery and Data Mining","article-title":"Ranking distillation: learning compact ranking models with high performance for recommender system","author":"Tang","year":"2018"},{"key":"10.1016\/j.artint.2024.104235_br0720","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","article-title":"Domain agnostic real-valued specificity prediction","author":"Ko","year":"2019"},{"key":"10.1016\/j.artint.2024.104235_br0730","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","article-title":"Cross modal retrieval with querybank normalisation","author":"Bogolin","year":"2022"},{"author":"Fang","key":"10.1016\/j.artint.2024.104235_br0740"},{"key":"10.1016\/j.artint.2024.104235_br0750","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","article-title":"Dual dense encoding for zero-example video retrieval","author":"Dong","year":"2019"},{"key":"10.1016\/j.artint.2024.104235_br0760","series-title":"Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision","article-title":"Masking modalities for cross-modal video retrieval","author":"Gabeur","year":"2022"},{"key":"10.1016\/j.artint.2024.104235_br0770","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","article-title":"Frozen in time: a joint video and image encoder for end-to-end retrieval","author":"Bain","year":"2021"},{"key":"10.1016\/j.artint.2024.104235_br0780","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","article-title":"Advancing high-resolution video-language representation with large-scale video transcriptions","author":"Xue","year":"2022"},{"key":"10.1016\/j.artint.2024.104235_br0790","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","article-title":"Bridging video-text retrieval with multiple choice questions","author":"Ge","year":"2022"},{"key":"10.1016\/j.artint.2024.104235_br0800","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","article-title":"Object-aware video-language pre-training for retrieval","author":"Wang","year":"2022"},{"author":"Luo","key":"10.1016\/j.artint.2024.104235_br0810"},{"author":"Faghri","key":"10.1016\/j.artint.2024.104235_br0820"},{"author":"Venugopalan","key":"10.1016\/j.artint.2024.104235_br0830"},{"key":"10.1016\/j.artint.2024.104235_br0840","series-title":"Advances in Neural Information Processing Systems","article-title":"Lookahead optimizer: k steps forward, 1 step back","author":"Zhang","year":"2019"},{"author":"Kiros","key":"10.1016\/j.artint.2024.104235_br0850"}],"container-title":["Artificial Intelligence"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0004370224001711?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0004370224001711?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2024,12,4]],"date-time":"2024-12-04T01:46:06Z","timestamp":1733276766000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0004370224001711"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,1]]},"references-count":85,"alternative-id":["S0004370224001711"],"URL":"https:\/\/doi.org\/10.1016\/j.artint.2024.104235","relation":{},"ISSN":["0004-3702"],"issn-type":[{"type":"print","value":"0004-3702"}],"subject":[],"published":{"date-parts":[[2025,1]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"TeachText: CrossModal text-video retrieval through generalized distillation","name":"articletitle","label":"Article Title"},{"value":"Artificial Intelligence","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.artint.2024.104235","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2024 Published by Elsevier B.V.","name":"copyright","label":"Copyright"}],"article-number":"104235"}}