{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,12,8]],"date-time":"2024-12-08T05:07:55Z","timestamp":1733634475071,"version":"3.30.1"},"reference-count":43,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2025,4,1]],"date-time":"2025-04-01T00:00:00Z","timestamp":1743465600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2025,4,1]],"date-time":"2025-04-01T00:00:00Z","timestamp":1743465600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2024,11,25]],"date-time":"2024-11-25T00:00:00Z","timestamp":1732492800000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0\/"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Pattern Recognition"],"published-print":{"date-parts":[[2025,4]]},"DOI":"10.1016\/j.patcog.2024.111189","type":"journal-article","created":{"date-parts":[[2024,11,17]],"date-time":"2024-11-17T20:38:07Z","timestamp":1731875887000},"page":"111189","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["Percept, Chat, Adapt: Knowledge transfer of foundation models for open-world video recognition"],"prefix":"10.1016","volume":"160","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-2419-7517","authenticated-orcid":false,"given":"Boyu","family":"Chen","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0009-6783-3270","authenticated-orcid":false,"given":"Siran","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Kunchang","family":"Li","sequence":"additional","affiliation":[]},{"given":"Qinglin","family":"Xu","sequence":"additional","affiliation":[]},{"given":"Yu","family":"Qiao","sequence":"additional","affiliation":[]},{"given":"Yali","family":"Wang","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"year":"2022","series-title":"Uniformerv2: Spatiotemporal learning by arming image vits with video uniformer","author":"Li","key":"10.1016\/j.patcog.2024.111189_b1"},{"key":"10.1016\/j.patcog.2024.111189_b2","article-title":"Multi-grained clip focus for skeleton-based action recognition","author":"Qiu","year":"2023","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2024.111189_b3","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2023.109905","article-title":"Relative-position embedding based spatially and temporally decoupled transformer for action recognition","volume":"145","author":"Ma","year":"2024","journal-title":"Pattern Recognit."},{"year":"2023","series-title":"Llama: Open and efficient foundation language models","author":"Touvron","key":"10.1016\/j.patcog.2024.111189_b4"},{"year":"2022","series-title":"Introducing ChatGPT","author":"OpenAI","key":"10.1016\/j.patcog.2024.111189_b5"},{"year":"2023","series-title":"Vicuna: An open-source chatbot impressing gpt-4 with 90%* chatgpt quality","author":"Chiang","key":"10.1016\/j.patcog.2024.111189_b6"},{"key":"10.1016\/j.patcog.2024.111189_b7","first-page":"27730","article-title":"Training language models to follow instructions with human feedback","volume":"35","author":"Ouyang","year":"2022","journal-title":"Adv. Neural Inf. Process. Syst."},{"year":"2023","series-title":"Videochat: Chat-centric video understanding","author":"Li","key":"10.1016\/j.patcog.2024.111189_b8"},{"key":"10.1016\/j.patcog.2024.111189_b9","series-title":"International Conference on Machine Learning","first-page":"12888","article-title":"Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation","author":"Li","year":"2022"},{"year":"2022","series-title":"Internvideo: General video foundation models via generative and discriminative learning","author":"Wang","key":"10.1016\/j.patcog.2024.111189_b10"},{"year":"2023","series-title":"Visual instruction tuning","author":"Liu","key":"10.1016\/j.patcog.2024.111189_b11"},{"key":"10.1016\/j.patcog.2024.111189_b12","first-page":"23716","article-title":"Flamingo: a visual language model for few-shot learning","volume":"35","author":"Alayrac","year":"2022","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.patcog.2024.111189_b13","doi-asserted-by":"crossref","unstructured":"K.C. Chan, S. Zhou, X. Xu, C.C. Loy, Investigating tradeoffs in real-world video super-resolution, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022, pp. 5962\u20135971.","DOI":"10.1109\/CVPR52688.2022.00587"},{"key":"10.1016\/j.patcog.2024.111189_b14","article-title":"Uniformer: Unifying convolution and self-attention for visual recognition","author":"Li","year":"2023","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.patcog.2024.111189_b15","series-title":"International Conference on Machine Learning","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","author":"Radford","year":"2021"},{"key":"10.1016\/j.patcog.2024.111189_b16","series-title":"2020 25th International Conference on Pattern Recognition","first-page":"7387","article-title":"Tinyvirat: Low-resolution video action recognition","author":"Demir","year":"2021"},{"key":"10.1016\/j.patcog.2024.111189_b17","series-title":"Deep Learning for Human Activity Recognition: Second International Workshop, DL-HAR 2020, Kyoto, Japan, January 8, 2021, Proceedings 2","first-page":"70","article-title":"Arid: A new dataset for recognizing action in the dark","author":"Xu","year":"2021"},{"key":"10.1016\/j.patcog.2024.111189_b18","series-title":"2022 26th International Conference on Pattern Recognition","first-page":"4967","article-title":"VideoPipe 2022 challenge: Real-world video understanding for urban pipe inspection","author":"Liu","year":"2022"},{"key":"10.1016\/j.patcog.2024.111189_b19","doi-asserted-by":"crossref","unstructured":"A. Arnab, M. Dehghani, G. Heigold, C. Sun, M. Lu\u010di\u0107, C. Schmid, Vivit: A video vision transformer, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2021, pp. 6836\u20136846.","DOI":"10.1109\/ICCV48922.2021.00676"},{"key":"10.1016\/j.patcog.2024.111189_b20","unstructured":"G. Bertasius, H. Wang, L. Torresani, Is space-time attention all you need for video understanding?, in: ICML, Vol. 2, No. 3, 2021, p. 4."},{"year":"2021","series-title":"Actionclip: A new paradigm for video action recognition","author":"Wang","key":"10.1016\/j.patcog.2024.111189_b21"},{"year":"2017","series-title":"The kinetics human action video dataset","author":"Kay","key":"10.1016\/j.patcog.2024.111189_b22"},{"key":"10.1016\/j.patcog.2024.111189_b23","doi-asserted-by":"crossref","unstructured":"R. Goyal, S. Ebrahimi Kahou, V. Michalski, J. Materzynska, et al., The\u201d something something\u201d video database for learning and evaluating visual common sense, in: Proceedings of the IEEE International Conference on Computer Vision, 2017, pp. 5842\u20135850.","DOI":"10.1109\/ICCV.2017.622"},{"key":"10.1016\/j.patcog.2024.111189_b24","article-title":"Exploring rich semantics for open-set action recognition","author":"Hu","year":"2023","journal-title":"IEEE Trans. Multimed."},{"key":"10.1016\/j.patcog.2024.111189_b25","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"16351","article-title":"Evidential active recognition: Intelligent and prudent open-world embodied perception","author":"Fan","year":"2024"},{"key":"10.1016\/j.patcog.2024.111189_b26","doi-asserted-by":"crossref","DOI":"10.1109\/TPAMI.2023.3311447","article-title":"Vectorized evidential learning for weakly-supervised temporal action localization","author":"Gao","year":"2023","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.patcog.2024.111189_b27","doi-asserted-by":"crossref","unstructured":"R. Chen, J. Chen, Z. Liang, H. Gao, S. Lin, Darklight networks for action recognition in the dark, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2021, pp. 846\u2013852.","DOI":"10.1109\/CVPRW53098.2021.00094"},{"key":"10.1016\/j.patcog.2024.111189_b28","first-page":"1877","article-title":"Language models are few-shot learners","volume":"33","author":"Brown","year":"2020","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.patcog.2024.111189_b29","doi-asserted-by":"crossref","unstructured":"W. Wang, H. Bao, L. Dong, J. Bjorck, Z. Peng, et al., Image as a Foreign Language: BEiT Pretraining for Vision and Vision-Language Tasks, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2023, pp. 19175\u201319186.","DOI":"10.1109\/CVPR52729.2023.01838"},{"year":"2020","series-title":"An image is worth 16x16 words: Transformers for image recognition at scale","author":"Dosovitskiy","key":"10.1016\/j.patcog.2024.111189_b30"},{"key":"10.1016\/j.patcog.2024.111189_b31","first-page":"26462","article-title":"St-adapter: Parameter-efficient image-to-video transfer learning","volume":"35","author":"Pan","year":"2022","journal-title":"Adv. Neural Inf. Process. Syst."},{"issue":"9","key":"10.1016\/j.patcog.2024.111189_b32","doi-asserted-by":"crossref","first-page":"2337","DOI":"10.1007\/s11263-022-01653-1","article-title":"Learning to prompt for vision-language models","volume":"130","author":"Zhou","year":"2022","journal-title":"Int. J. Comput. Vis."},{"year":"2021","series-title":"Clip-adapter: Better vision-language models with feature adapters","author":"Gao","key":"10.1016\/j.patcog.2024.111189_b33"},{"key":"10.1016\/j.patcog.2024.111189_b34","series-title":"European Conference on Computer Vision","first-page":"388","article-title":"Frozen clip models are efficient video learners","author":"Lin","year":"2022"},{"year":"2023","series-title":"Segment anything","author":"Kirillov","key":"10.1016\/j.patcog.2024.111189_b35"},{"key":"10.1016\/j.patcog.2024.111189_b36","doi-asserted-by":"crossref","unstructured":"D. Tran, H. Wang, L. Torresani, J. Ray, Y. LeCun, M. Paluri, A closer look at spatiotemporal convolutions for action recognition, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2018, pp. 6450\u20136459.","DOI":"10.1109\/CVPR.2018.00675"},{"year":"2018","series-title":"Bert: Pre-training of deep bidirectional transformers for language understanding","author":"Devlin","key":"10.1016\/j.patcog.2024.111189_b37"},{"key":"10.1016\/j.patcog.2024.111189_b38","doi-asserted-by":"crossref","unstructured":"D. Ghadiyaram, D. Tran, D. Mahajan, Large-scale weakly-supervised pre-training for video action recognition, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2019, pp. 12046\u201312055.","DOI":"10.1109\/CVPR.2019.01232"},{"year":"2017","series-title":"Decoupled weight decay regularization","author":"Loshchilov","key":"10.1016\/j.patcog.2024.111189_b39"},{"year":"2005","series-title":"Digital image processing","author":"J\u00e4hne","key":"10.1016\/j.patcog.2024.111189_b40"},{"key":"10.1016\/j.patcog.2024.111189_b41","doi-asserted-by":"crossref","unstructured":"K. He, X. Chen, S. Xie, Y. Li, P. Doll\u00e1r, R. Girshick, Masked autoencoders are scalable vision learners, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022, pp. 16000\u201316009.","DOI":"10.1109\/CVPR52688.2022.01553"},{"year":"2023","series-title":"Unmasked teacher: Towards training-efficient video foundation models","author":"Li","key":"10.1016\/j.patcog.2024.111189_b42"},{"year":"2016","series-title":"Grad-CAM: Why did you say that? Visual explanations from deep networks via gradient-based localization, CoRR","author":"Selvaraju","key":"10.1016\/j.patcog.2024.111189_b43"}],"container-title":["Pattern Recognition"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0031320324009403?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0031320324009403?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2024,12,7]],"date-time":"2024-12-07T09:00:25Z","timestamp":1733562025000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0031320324009403"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4]]},"references-count":43,"alternative-id":["S0031320324009403"],"URL":"https:\/\/doi.org\/10.1016\/j.patcog.2024.111189","relation":{},"ISSN":["0031-3203"],"issn-type":[{"type":"print","value":"0031-3203"}],"subject":[],"published":{"date-parts":[[2025,4]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Percept, Chat, Adapt: Knowledge transfer of foundation models for open-world video recognition","name":"articletitle","label":"Article Title"},{"value":"Pattern Recognition","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.patcog.2024.111189","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2024 The Authors. Published by Elsevier Ltd.","name":"copyright","label":"Copyright"}],"article-number":"111189"}}