{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,4,11]],"date-time":"2025-04-11T15:30:31Z","timestamp":1744385431470,"version":"3.28.0"},"reference-count":63,"publisher":"IEEE","license":[{"start":{"date-parts":[[2023,6,1]],"date-time":"2023-06-01T00:00:00Z","timestamp":1685577600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,6,1]],"date-time":"2023-06-01T00:00:00Z","timestamp":1685577600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023,6]]},"DOI":"10.1109\/cvpr52729.2023.02207","type":"proceedings-article","created":{"date-parts":[[2023,8,22]],"date-time":"2023-08-22T17:30:52Z","timestamp":1692725452000},"page":"23045-23055","source":"Crossref","is-referenced-by-count":19,"title":["Towards Generalisable Video Moment Retrieval: Visual-Dynamic Injection to Image-Text Pre-Training"],"prefix":"10.1109","author":[{"given":"Dezhao","family":"Luo","sequence":"first","affiliation":[{"name":"Queen Mary University of London"}]},{"given":"Jiabo","family":"Huang","sequence":"additional","affiliation":[{"name":"Queen Mary University of London"}]},{"given":"Shaogang","family":"Gong","sequence":"additional","affiliation":[{"name":"Queen Mary University of London"}]},{"given":"Hailin","family":"Jin","sequence":"additional","affiliation":[{"name":"Adobe Research"}]},{"given":"Yang","family":"Liu","sequence":"additional","affiliation":[{"name":"Peking University,WICT"}]}],"member":"263","reference":[{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.563"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6924"},{"key":"ref12","first-page":"2121","article-title":"A deep visual-semantic embedding model","author":"frome","year":"0","journal-title":"NeurIPS"},{"key":"ref56","article-title":"Wsabie: Scaling up to large vocabulary image annotation","author":"weston","year":"2011","journal-title":"IJCAI"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33018393"},{"key":"ref59","article-title":"Florence: A new foundation model for computer vision","author":"yuan","year":"0","journal-title":"ArXiv Preprint"},{"key":"ref14","first-page":"1984","article-title":"Excl: Extractive clip localization using natural language descriptions","author":"ghosh","year":"2019","journal-title":"NAACL"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1145\/3404835.3462823"},{"key":"ref53","article-title":"Actionclip: A new paradigm for video action recognition","author":"wang","year":"2021","journal-title":"ArXiv Preprint"},{"key":"ref52","first-page":"30","article-title":"Attention is all you need","author":"vaswani","year":"2017","journal-title":"NeurIPS"},{"key":"ref11","first-page":"31","article-title":"Weakly supervised dense event captioning in videos","author":"duan","year":"2018","journal-title":"NeurIPS"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i3.20163"},{"key":"ref10","article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","author":"dosovitskiy","year":"2021","journal-title":"ICLRE"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00042"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.findings-acl.318"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref19","first-page":"7199","article-title":"Cross-sentence temporal and semantic relations in video ac-tivity localisation","author":"huang","year":"2021","journal-title":"ICCV"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.510"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00756"},{"key":"ref46","first-page":"510","article-title":"Hollywood in homes: Crowdsourcing data collection for activity under-standing","author":"sigurdsson","year":"2016","journal-title":"ECCV"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1086\/214279"},{"key":"ref48","article-title":"Very deep convo-lutional networks for large-scale image recognition","author":"simonyan","year":"2015","journal-title":"ICLRE"},{"key":"ref47","article-title":"Very deep convo-lutional networks for large-scale image recognition","author":"simonyan","year":"2015","journal-title":"ICLRE"},{"key":"ref42","first-page":"8748","article-title":"Learning transferable visual models from natural language super-vision","author":"radford","year":"2021","journal-title":"ICML"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01082"},{"key":"ref44","article-title":"Distilbert, a distilled version of bert: smaller, faster, cheaper and lighter","author":"sanh","year":"2019","journal-title":"ArXiv Preprint"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01755"},{"key":"ref49","first-page":"5198","article-title":"Reclip: A strong zero-shot baseline for referring expression compre-hension","author":"subramanian","year":"2022","journal-title":"ACL"},{"key":"ref8","first-page":"4171","article-title":"Bert: Pre-training of deep bidirectional trans-formers for language understanding","author":"devlin","year":"2019","journal-title":"NAACL"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.3390\/app11093730"},{"key":"ref4","article-title":"Improving video-text retrieval by multi-stream corpus alignment and dual softmax loss","author":"cheng","year":"2021","journal-title":"ArXiv Preprint"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1015"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW53098.2021.00444"},{"key":"ref5","first-page":"1","article-title":"Natural language processing techniques to reveal human-computer interaction for development research top-ics","author":"chiyangwa","year":"0","journal-title":"icARTi"},{"key":"ref40","first-page":"1","article-title":"Image-to-word transformation based on dividing and vector quantizing images with words","author":"mori","year":"1999","journal-title":"First international work-shop on multimedia intelligent storage and retrieval manage-ment"},{"key":"ref35","first-page":"6959","article-title":"Retrieval augmented clas-sification for long-tail visual recognition","author":"long","year":"2022","journal-title":"CVPR"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01170"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2022.07.028"},{"key":"ref36","article-title":"Decoupled weight decay regularization","author":"loshchilov","year":"2017","journal-title":"ArXiv Preprint"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01108"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01219-9_34"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547969"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i2.20058"},{"key":"ref2","first-page":"38","article-title":"Locvtp: Video-text pre-training for temporal localization","author":"cao","year":"2022","journal-title":"ECCV"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00990"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00272"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.83"},{"key":"ref23","article-title":"The kinetics hu-man action video dataset","author":"kay","year":"2017","journal-title":"ArXiv Preprint"},{"key":"ref26","first-page":"2","article-title":"Handwritten digit recognition with a back-propagation network","author":"lecun","year":"1989","journal-title":"NeurIPS"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01753"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19830-4_41"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00834"},{"key":"ref22","first-page":"105","article-title":"Prompting visual-language models for efficient video understanding","author":"ju","year":"2022","journal-title":"ECCV"},{"key":"ref21","first-page":"4904","article-title":"Scaling up visual and vision-language representation learning with noisy text supervision","author":"jia","year":"2021","journal-title":"ICML"},{"key":"ref28","first-page":"9694","article-title":"Align before fuse: Vision and language representation learning with momentum distillation","volume":"34","author":"li","year":"2021","journal-title":"NeurIPS"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00304"},{"key":"ref29","first-page":"388","article-title":"Frozen clip models are efficient video learners","author":"lin","year":"2022","journal-title":"ECCV"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01030"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6984"},{"key":"ref61","first-page":"6543","article-title":"Span-based localizing network for natural language video lo-calization","author":"zhang","year":"2020","journal-title":"ACL"}],"event":{"name":"2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","start":{"date-parts":[[2023,6,17]]},"location":"Vancouver, BC, Canada","end":{"date-parts":[[2023,6,24]]}},"container-title":["2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/10203037\/10203050\/10204124.pdf?arnumber=10204124","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,9,11]],"date-time":"2023-09-11T18:05:13Z","timestamp":1694455513000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10204124\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,6]]},"references-count":63,"URL":"https:\/\/doi.org\/10.1109\/cvpr52729.2023.02207","relation":{},"subject":[],"published":{"date-parts":[[2023,6]]}}}