{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,10,6]],"date-time":"2024-10-06T01:16:29Z","timestamp":1728177389715},"reference-count":50,"publisher":"IEEE","license":[{"start":{"date-parts":[[2023,6,1]],"date-time":"2023-06-01T00:00:00Z","timestamp":1685577600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,6,1]],"date-time":"2023-06-01T00:00:00Z","timestamp":1685577600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023,6]]},"DOI":"10.1109\/cvpr52729.2023.00633","type":"proceedings-article","created":{"date-parts":[[2023,8,22]],"date-time":"2023-08-22T17:30:52Z","timestamp":1692725452000},"source":"Crossref","is-referenced-by-count":32,"title":["Fine-tuned CLIP Models are Efficient Video Learners"],"prefix":"10.1109","author":[{"given":"Hanoona","family":"Rasheed","sequence":"first","affiliation":[{"name":"Mohamed bin Zayed University of AI"}]},{"given":"Muhammad Uzair","family":"Khattak","sequence":"additional","affiliation":[{"name":"Mohamed bin Zayed University of AI"}]},{"given":"Muhammad","family":"Maaz","sequence":"additional","affiliation":[{"name":"Mohamed bin Zayed University of AI"}]},{"given":"Salman","family":"Khan","sequence":"additional","affiliation":[{"name":"Mohamed bin Zayed University of AI"}]},{"given":"Fahad Shahbaz","family":"Khan","sequence":"additional","affiliation":[{"name":"Mohamed bin Zayed University of AI"}]}],"member":"263","reference":[{"key":"ref13","article-title":"All about knowledge graphs for actions","author":"ghosh","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00630"},{"key":"ref15","article-title":"Open-vocabulary object detection via vision and language knowledge distillation","author":"gu","year":"0","journal-title":"International Conference on Learning Representations"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.622"},{"key":"ref11","article-title":"De-coupling zero-shot semantic segmentation","author":"ding","year":"0","journal-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition"},{"key":"ref10","article-title":"Spatiotemporal residual networks for video action recognition","author":"christoph","year":"0","journal-title":"Advances in neural information processing systems"},{"key":"ref17","article-title":"Prompting visual-language models for efficient video understanding","author":"ju","year":"0","journal-title":"The European Conference on Computer Vision"},{"key":"ref16","article-title":"Scaling up visual and vision-language representation learning with noisy text supervision","author":"jia","year":"2021","journal-title":"International Conference on Machine Learning"},{"key":"ref19","article-title":"The kinetics human action video dataset","author":"kay","year":"2017","journal-title":"ArXiv Preprint"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00180"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00983"},{"key":"ref46","doi-asserted-by":"crossref","DOI":"10.1023\/A:1026531017760","article-title":"Denseclip: Extract free dense labels from clip","author":"zhou","year":"0","journal-title":"The European Conference on Computer 
Vision"},{"key":"ref45","article-title":"Tip-adapter: Training-free clip-adapter for better vision-language modeling","author":"zhang","year":"0","journal-title":"The European Conference on Computer Vision"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01631"},{"key":"ref47","article-title":"Learning to prompt for vision-language models","author":"zhou","year":"2021","journal-title":"ArXiv Preprint"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.226"},{"key":"ref41","article-title":"Alternative semantic representations for zero-shot human action recognition","author":"wang","year":"0","journal-title":"ECML PKDD"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.321"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00333"},{"key":"ref49","doi-asserted-by":"crossref","DOI":"10.1023\/A:1026531017760","article-title":"Detecting twenty-thousand classes using image-level supervision","author":"zhou","year":"0","journal-title":"The European Conference on Computer Vision"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01338"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"ref9","article-title":"UNITER: UNiversal Image-TExt Representation Learning","author":"chen","year":"0","journal-title":"The European Conference on Computer Vision"},{"key":"ref4","article-title":"Is space-time attention all you need for video understanding?","author":"bertasius","year":"0","journal-title":"International Conference on Machine Learning"},{"key":"ref3","article-title":"Visual prompting: Modifying pixel space to adapt pre-trained models","author":"bahng","year":"2022","journal-title":"ArXiv Preprint"},{"key":"ref6","article-title":"A short note about kinetics-600","author":"carreira","year":"2018","journal-title":"ArXiv Preprint"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00467"},{"key":"ref40","article-title":"Actionclip: A new paradigm for video action recognition","author":"wang","year":"2021","journal-title":"ArXiv Preprint"},{"key":"ref35","article-title":"Bridging the gap between object and image-level representations for open-vocabulary detection","author":"rasheed","year":"0","journal-title":"Advances in neural information processing systems"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01755"},{"key":"ref37","article-title":"Photorealistic text-to-image diffusion models with deep language understanding","author":"saharia","year":"2022","journal-title":"ArXiv Preprint"},{"key":"ref36","article-title":"An embarrassingly simple approach to zero-shot learning","author":"romera-paredes","year":"0","journal-title":"International Conference on Machine Learning"},{"key":"ref31","article-title":"Keeping your eye on the ball: Trajectory attention in video transformers","author":"patrick","year":"0","journal-title":"Advances in neural information processing systems"},{"key":"ref30","article-title":"Expanding language-image pretrained models for general video recognition","author":"ni","year":"0","journal-title":"The European Conference on Computer Vision"},{"key":"ref33","article-title":"Learning transferable visual models from natural language supervision","author":"radford","year":"0","journal-title":"International Conference on Machine 
Learning"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.117"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00676"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298911"},{"key":"ref39","article-title":"Temporal segment networks: Towards good practices for deep action recognition","author":"wang","year":"0","journal-title":"The European Conference on Computer Vision"},{"key":"ref38","article-title":"Ucf101: A dataset of 101 human actions classes from videos in the wild","author":"soomro","year":"2012","journal-title":"ArXiv Preprint"},{"key":"ref24","article-title":"VisualBERT: A Simple and Performant Baseline for Vision and Language","author":"li","year":"2019","journal-title":"ArXiv Preprint"},{"key":"ref23","article-title":"Uniformer: Unifying convolution and self-attention for visual recognition","author":"li","year":"0","journal-title":"ICLRE"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00320"},{"key":"ref25","author":"li","year":"2020","journal-title":"Oscar Object-semantics aligned pre-training for vision-language tasks"},{"key":"ref20","article-title":"Maple: Multi-modal prompt learning","author":"khattak","year":"2022","journal-title":"ArXiv"},{"key":"ref22","article-title":"Language-driven semantic segmentation","author":"li","year":"0","journal-title":"International Conference on Learning Representations"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"ref28","article-title":"ViL-BERT: Pretraining Task-Agnostic Visiolinguistic Representations for Vision-and-Language Tasks","author":"lu","year":"0","journal-title":"Advances in neural information processing systems"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00320"},{"key":"ref29","article-title":"Class-agnostic object detection with multi-modal transformer","author":"maaz","year":"0","journal-title":"The European Conference on Computer Vision"}],"event":{"name":"2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","location":"Vancouver, BC, Canada","start":{"date-parts":[[2023,6,17]]},"end":{"date-parts":[[2023,6,24]]}},"container-title":["2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/10203037\/10203050\/10204690.pdf?arnumber=10204690","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,9,11]],"date-time":"2023-09-11T18:05:07Z","timestamp":1694455507000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10204690\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,6]]},"references-count":50,"URL":"https:\/\/doi.org\/10.1109\/cvpr52729.2023.00633","relation":{},"subject":[],"published":{"date-parts":[[2023,6]]}}}