{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,11,23]],"date-time":"2024-11-23T05:27:58Z","timestamp":1732339678759,"version":"3.28.0"},"publisher-location":"Cham","reference-count":56,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031729690","type":"print"},{"value":"9783031729706","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,23]],"date-time":"2024-11-23T00:00:00Z","timestamp":1732320000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,23]],"date-time":"2024-11-23T00:00:00Z","timestamp":1732320000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,23]],"date-time":"2024-11-23T00:00:00Z","timestamp":1732320000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,23]],"date-time":"2024-11-23T00:00:00Z","timestamp":1732320000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72970-6_9","type":"book-chapter","created":{"date-parts":[[2024,11,22]],"date-time":"2024-11-22T10:51:51Z","timestamp":1732272711000},"page":"143-160","update-policy":"http:\/\/dx.doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["ClearCLIP: Decomposing CLIP Representations for\u00a0Dense Vision-Language Inference"],"prefix":"10.1007","author":[{"given":"Mengcheng","family":"Lan","sequence":"first","affiliation":[]},{"given":"Chaofeng","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Yiping","family":"Ke","sequence":"additional","affiliation":[]},{"given":"Xinjiang","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Litong","family":"Feng","sequence":"additional","affiliation":[]},{"given":"Wayne","family":"Zhang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,23]]},"reference":[{"key":"9_CR1","first-page":"25","volume":"33","author":"JB Alayrac","year":"2020","unstructured":"Alayrac, J.B., et al.: Self-supervised multimodal versatile networks. Adv. Neural. Inf. Process. Syst. 33, 25\u201337 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"9_CR2","doi-asserted-by":"crossref","unstructured":"Antol, S., et al.: VQA: Visual question answering. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2425\u20132433 (2015)","DOI":"10.1109\/ICCV.2015.279"},{"key":"9_CR3","doi-asserted-by":"crossref","unstructured":"Bousselham, W., Petersen, F., Ferrari, V., Kuehne, H.: Grounding everything: emerging localization properties in vision-language transformers. arXiv preprint arXiv:2312.00878 (2023)","DOI":"10.1109\/CVPR52733.2024.00367"},{"key":"9_CR4","doi-asserted-by":"crossref","unstructured":"Caesar, H., Uijlings, J., Ferrari, V.: COCO-stuff: thing and stuff classes in context. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 
1209\u20131218 (2018)","DOI":"10.1109\/CVPR.2018.00132"},{"key":"9_CR5","doi-asserted-by":"crossref","unstructured":"Caron, M., et al.: Emerging properties in self-supervised vision transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 9650\u20139660 (2021)","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"9_CR6","doi-asserted-by":"crossref","unstructured":"Cha, J., Mun, J., Roh, B.: Learning to generate text-grounded mask for open-world semantic segmentation from only image-text pairs. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11165\u201311174 (2023)","DOI":"10.1109\/CVPR52729.2023.01074"},{"key":"9_CR7","unstructured":"Chen, X., et\u00a0al.: PaLI: a jointly-scaled multilingual language-image model. arXiv preprint arXiv:2209.06794 (2022)"},{"key":"9_CR8","doi-asserted-by":"crossref","unstructured":"Cherti, M., et al.: Reproducible scaling laws for contrastive language-image learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2818\u20132829 (2023)","DOI":"10.1109\/CVPR52729.2023.00276"},{"key":"9_CR9","unstructured":"Cho, J., Lei, J., Tan, H., Bansal, M.: Unifying vision-and-language tasks via text generation. In: International Conference on Machine Learning, pp. 1931\u20131942. PMLR (2021)"},{"key":"9_CR10","unstructured":"Contributors, M.: MMSegmentation: OpenMMLab semantic segmentation toolbox and benchmark (2020)"},{"key":"9_CR11","doi-asserted-by":"crossref","unstructured":"Cordts, M., et al.: The cityscapes dataset for semantic urban scene understanding. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3213\u20133223 (2016)","DOI":"10.1109\/CVPR.2016.350"},{"key":"9_CR12","unstructured":"Darcet, T., Oquab, M., Mairal, J., Bojanowski, P.: Vision transformers need registers. arXiv preprint arXiv:2309.16588 (2023)"},{"key":"9_CR13","unstructured":"Everingham, M., Winn, J.: The pascal visual object classes challenge 2012 (VOC2012) development kit. Pattern Anal. Stat. Model. Comput. Learn., Tech. Rep. 2007(1-45), 5 (2012)"},{"key":"9_CR14","unstructured":"Gandelsman, Y., Efros, A.A., Steinhardt, J.: Interpreting CLIP\u2019s image representation via text-based decomposition. arXiv preprint arXiv:2310.05916 (2023)"},{"key":"9_CR15","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-4419-7970-4","volume-title":"Entropy and Information Theory","author":"RM Gray","year":"2011","unstructured":"Gray, R.M.: Entropy and Information Theory. Springer, Heidelberg (2011). https:\/\/doi.org\/10.1007\/978-1-4419-7970-4"},{"key":"9_CR16","unstructured":"Hamilton, M., Zhang, Z., Hariharan, B., Snavely, N., Freeman, W.T.: Unsupervised semantic segmentation by distilling feature correspondences. arXiv preprint arXiv:2203.08414 (2022)"},{"key":"9_CR17","doi-asserted-by":"crossref","unstructured":"Han, C., Zhong, Y., Li, D., Han, K., Ma, L.: Open-vocabulary semantic segmentation with decoupled one-pass network. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1086\u20131096 (2023)","DOI":"10.1109\/ICCV51070.2023.00106"},{"key":"9_CR18","doi-asserted-by":"crossref","unstructured":"He, W., Jamonnak, S., Gou, L., Ren, L.: CLIP-S4: language-guided self-supervised semantic segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 
11207\u201311216 (2023)","DOI":"10.1109\/CVPR52729.2023.01078"},{"key":"9_CR19","unstructured":"Jia, C., et al.: Scaling up visual and vision-language representation learning with noisy text supervision. In: International Conference on Machine Learning, pp. 4904\u20134916. PMLR (2021)"},{"key":"9_CR20","unstructured":"Jiao, S., Wei, Y., Wang, Y., Zhao, Y., Shi, H.: Learning mask-aware CLIP representations for zero-shot segmentation. arXiv preprint arXiv:2310.00240 (2023)"},{"key":"9_CR21","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"652","DOI":"10.1007\/978-3-031-19833-5_38","volume-title":"Computer Vision \u2013 ECCV 2022","author":"AU Khan","year":"2022","unstructured":"Khan, A.U., Kuehne, H., Gan, C., Lobo, N.D.V., Shah, M.: Weakly supervised grounding for VQA in vision-language transformers. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13695, pp. 652\u2013670. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19833-5_38"},{"key":"9_CR22","unstructured":"Kim, W., Son, B., Kim, I.: ViLT: vision-and-language transformer without convolution or region supervision. In: International Conference on Machine Learning, pp. 5583\u20135594. PMLR (2021)"},{"key":"9_CR23","unstructured":"Lan, M., Wang, X., Ke, Y., Xu, J., Feng, L., Zhang, W.: SmooSeg: smoothness prior for unsupervised semantic segmentation. Adv. Neural Inf. Process. Syst. 36 (2024)"},{"key":"9_CR24","unstructured":"Li, J., Li, D., Xiong, C., Hoi, S.: Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In: International Conference on Machine Learning, pp. 12888\u201312900. PMLR (2022)"},{"key":"9_CR25","first-page":"9694","volume":"34","author":"J Li","year":"2021","unstructured":"Li, J., Selvaraju, R., Gotmare, A., Joty, S., Xiong, C., Hoi, S.C.H.: Align before fuse: vision and language representation learning with momentum distillation. Adv. Neural. Inf. Process. Syst. 34, 9694\u20139705 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"9_CR26","unstructured":"Li, Y., Wang, H., Duan, Y., Li, X.: CLIP surgery for better explainability with enhancement in open-vocabulary tasks. arXiv preprint arXiv:2304.05653 (2023)"},{"key":"9_CR27","unstructured":"Li, Y., Li, Z., Zeng, Q., Hou, Q., Cheng, M.M.: Cascade-CLIP: cascaded vision-language embeddings alignment for zero-shot semantic segmentation. arXiv preprint arXiv:2406.00670 (2024)"},{"key":"9_CR28","doi-asserted-by":"crossref","unstructured":"Li, Z., Zhou, Q., Zhang, X., Zhang, Y., Wang, Y., Xie, W.: Open-vocabulary object segmentation with diffusion models. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 7667\u20137676 (2023)","DOI":"10.1109\/ICCV51070.2023.00705"},{"key":"9_CR29","doi-asserted-by":"crossref","unstructured":"Liang, F., et al.: Open-vocabulary semantic segmentation with mask-adapted CLIP. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7061\u20137070 (2023)","DOI":"10.1109\/CVPR52729.2023.00682"},{"key":"9_CR30","unstructured":"Luo, H., Bao, J., Wu, Y., He, X., Li, T.: SegCLIP: patch aggregation with learnable centers for open-vocabulary semantic segmentation. In: International Conference on Machine Learning, pp. 23033\u201323044. 
PMLR (2023)"},{"key":"9_CR31","doi-asserted-by":"crossref","unstructured":"Melas-Kyriazi, L., Rupprecht, C., Laina, I., Vedaldi, A.: Deep spectral methods: a surprisingly strong baseline for unsupervised semantic segmentation and localization. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8364\u20138375 (2022)","DOI":"10.1109\/CVPR52688.2022.00818"},{"key":"9_CR32","doi-asserted-by":"crossref","unstructured":"Miech, A., Alayrac, J.B., Smaira, L., Laptev, I., Sivic, J., Zisserman, A.: End-to-end learning of visual representations from uncurated instructional videos. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9879\u20139889 (2020)","DOI":"10.1109\/CVPR42600.2020.00990"},{"key":"9_CR33","doi-asserted-by":"crossref","unstructured":"Mishra, A., Alahari, K., Jawahar, C.: Image retrieval using textual cues. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 3040\u20133047 (2013)","DOI":"10.1109\/ICCV.2013.378"},{"key":"9_CR34","doi-asserted-by":"crossref","unstructured":"Mottaghi, R., et al.: The role of context for object detection and semantic segmentation in the wild. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 891\u2013898 (2014)","DOI":"10.1109\/CVPR.2014.119"},{"key":"9_CR35","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"9_CR36","unstructured":"Ren, P., et al.: ViewCo: discovering text-supervised segmentation masks via multi-view semantic consistency. In: The Eleventh International Conference on Learning Representations (2023). https:\/\/openreview.net\/forum?id=2XLRBjY46O6"},{"key":"9_CR37","first-page":"25278","volume":"35","author":"C Schuhmann","year":"2022","unstructured":"Schuhmann, C., et al.: LAION-5B: an open large-scale dataset for training next generation image-text models. Adv. Neural. Inf. Process. Syst. 35, 25278\u201325294 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"9_CR38","first-page":"33754","volume":"35","author":"G Shin","year":"2022","unstructured":"Shin, G., Xie, W., Albanie, S.: ReCo: retrieve and co-segment for zero-shot transfer. Adv. Neural. Inf. Process. Syst. 35, 33754\u201333767 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"9_CR39","doi-asserted-by":"crossref","unstructured":"Sun, S., Li, R., Torr, P., Gu, X., Li, S.: CLIP as RNN: segment countless visual concepts without training endeavor. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13171\u201313182 (2024)","DOI":"10.1109\/CVPR52733.2024.01251"},{"key":"9_CR40","doi-asserted-by":"crossref","unstructured":"Wang, F., Mei, J., Yuille, A.: SCLIP: rethinking self-attention for dense vision-language inference. arXiv preprint arXiv:2312.01597 (2023)","DOI":"10.1007\/978-3-031-72664-4_18"},{"key":"9_CR41","unstructured":"Wu, S., et al.: CLIPSelf: vision transformer distills itself for open-vocabulary dense prediction. arXiv preprint arXiv:2310.01403 (2023)"},{"key":"9_CR42","unstructured":"Xing, Y., Kang, J., Xiao, A., Nie, J., Shao, L., Lu, S.: Rewrite caption semantics: bridging semantic gaps for language-supervised semantic segmentation. In: Thirty-seventh Conference on Neural Information Processing Systems (2023). 
https:\/\/openreview.net\/forum?id=9iafshF7s3"},{"key":"9_CR43","unstructured":"Xu, H., et al.: Demystifying clip data. arXiv preprint arXiv:2309.16671 (2023)"},{"key":"9_CR44","doi-asserted-by":"crossref","unstructured":"Xu, J., et al.: GroupViT: semantic segmentation emerges from text supervision. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18134\u201318144 (2022)","DOI":"10.1109\/CVPR52688.2022.01760"},{"key":"9_CR45","doi-asserted-by":"crossref","unstructured":"Xu, J., Liu, S., Vahdat, A., Byeon, W., Wang, X., De\u00a0Mello, S.: Open-vocabulary panoptic segmentation with text-to-image diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2955\u20132966 (2023)","DOI":"10.1109\/CVPR52729.2023.00289"},{"key":"9_CR46","doi-asserted-by":"crossref","unstructured":"Xu, J., et al.: Learning open-vocabulary semantic segmentation models from natural language supervision. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2935\u20132944 (2023)","DOI":"10.1109\/CVPR52729.2023.00287"},{"key":"9_CR47","doi-asserted-by":"crossref","unstructured":"Xu, M., Zhang, Z., Wei, F., Hu, H., Bai, X.: Side adapter network for open-vocabulary semantic segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2945\u20132954 (2023)","DOI":"10.1109\/CVPR52729.2023.00288"},{"key":"9_CR48","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"736","DOI":"10.1007\/978-3-031-19818-2_42","volume-title":"Computer Vision \u2013 ECCV 2022","author":"M Xu","year":"2022","unstructured":"Xu, M., et al.: A simple baseline for open-vocabulary semantic segmentation with pre-trained vision-language model. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13689, pp. 736\u2013753. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19818-2_42"},{"key":"9_CR49","doi-asserted-by":"crossref","unstructured":"Xu, X., Xiong, T., Ding, Z., Tu, Z.: MasQCLIP for open-vocabulary universal image segmentation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 887\u2013898 (2023)","DOI":"10.1109\/ICCV51070.2023.00088"},{"key":"9_CR50","unstructured":"Yao, L., et al.: FILIP: fine-grained interactive language-image pre-training. arXiv preprint arXiv:2111.07783 (2021)"},{"key":"9_CR51","unstructured":"Yu, J., Wang, Z., Vasudevan, V., Yeung, L., Seyedhosseini, M., Wu, Y.: CoCa: contrastive captioners are image-text foundation models. arXiv preprint arXiv:2205.01917 (2022)"},{"key":"9_CR52","unstructured":"Yu, Q., He, J., Deng, X., Shen, X., Chen, L.C.: Convolutions die hard: open-vocabulary segmentation with single frozen convolutional CLIP. arXiv preprint arXiv:2308.02487 (2023)"},{"key":"9_CR53","unstructured":"Yuan, L., et\u00a0al.: Florence: a new foundation model for computer vision. arXiv preprint arXiv:2111.11432 (2021)"},{"key":"9_CR54","unstructured":"Zhang, F., et al.: Uncovering prototypical knowledge for weakly open-vocabulary semantic segmentation. arXiv preprint arXiv:2310.19001 (2023)"},{"key":"9_CR55","doi-asserted-by":"publisher","first-page":"302","DOI":"10.1007\/s11263-018-1140-0","volume":"127","author":"B Zhou","year":"2019","unstructured":"Zhou, B., et al.: Semantic understanding of scenes through the ADE20K dataset. Int. J. Comput. Vision 127, 302\u2013321 (2019)","journal-title":"Int. J. Comput. 
Vision"},{"key":"9_CR56","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"696","DOI":"10.1007\/978-3-031-19815-1_40","volume-title":"Computer Vision \u2013 ECCV 2022","author":"C Zhou","year":"2022","unstructured":"Zhou, C., Loy, C.C., Dai, B.: Extract free dense labels from CLIP. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13688, pp. 696\u2013712. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19815-1_40"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72970-6_9","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,22]],"date-time":"2024-11-22T11:12:42Z","timestamp":1732273962000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72970-6_9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,23]]},"ISBN":["9783031729690","9783031729706"],"references-count":56,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72970-6_9","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,23]]},"assertion":[{"value":"23 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}