{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,11,19]],"date-time":"2024-11-19T18:47:34Z","timestamp":1732042054644},"publisher-location":"Cham","reference-count":84,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031200793"},{"type":"electronic","value":"9783031200809"}],"license":[{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022]]},"DOI":"10.1007\/978-3-031-20080-9_30","type":"book-chapter","created":{"date-parts":[[2022,11,2]],"date-time":"2022-11-02T19:59:12Z","timestamp":1667419152000},"page":"512-531","update-policy":"http:\/\/dx.doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":22,"title":["Class-Agnostic Object Detection with\u00a0Multi-modal Transformer"],"prefix":"10.1007","author":[{"given":"Muhammad","family":"Maaz","sequence":"first","affiliation":[]},{"given":"Hanoona","family":"Rasheed","sequence":"additional","affiliation":[]},{"given":"Salman","family":"Khan","sequence":"additional","affiliation":[]},{"given":"Fahad Shahbaz","family":"Khan","sequence":"additional","affiliation":[]},{"given":"Rao Muhammad","family":"Anwer","sequence":"additional","affiliation":[]},{"given":"Ming-Hsuan","family":"Yang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,11,3]]},"reference":[{"key":"30_CR1","doi-asserted-by":"crossref","unstructured":"Alexe, B., Deselaers, T., Ferrari, V.: What is an object? In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 73\u201380. IEEE (2010)","DOI":"10.1109\/CVPR.2010.5540226"},{"issue":"11","key":"30_CR2","doi-asserted-by":"publisher","first-page":"2189","DOI":"10.1109\/TPAMI.2012.28","volume":"34","author":"B Alexe","year":"2012","unstructured":"Alexe, B., Deselaers, T., Ferrari, V.: Measuring the objectness of image windows. IEEE Trans. Pattern Anal. Mach. Intell. 34(11), 2189\u20132202 (2012)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"30_CR3","doi-asserted-by":"crossref","unstructured":"Bar, A.,et al.: DETReg: unsupervised Pretraining with region priors for object detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2022)","DOI":"10.1109\/CVPR52688.2022.01420"},{"key":"30_CR4","doi-asserted-by":"crossref","unstructured":"Bendale, A., Boult, T.: Towards open world recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1893\u20131902 (2015)","DOI":"10.1109\/CVPR.2015.7298799"},{"key":"30_CR5","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"213","DOI":"10.1007\/978-3-030-58452-8_13","volume-title":"Computer Vision \u2013 ECCV 2020","author":"N Carion","year":"2020","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-end object detection with transformers. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12346, pp. 213\u2013229. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58452-8_13"},{"key":"30_CR6","unstructured":"Caron, M., Misra, I., Mairal, J., Goyal, P., Bojanowski, P., Joulin, A.: Unsupervised learning of visual features by contrasting cluster assignments. In: Advances in Neural Information Processing Systems (2020)"},{"key":"30_CR7","doi-asserted-by":"crossref","unstructured":"Caron, M., et al.: Emerging properties in self-supervised vision transformers. arXiv preprint arXiv:2104.14294 (2021)","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"30_CR8","unstructured":"Chen, T., Kornblith, S., Norouzi, M., Hinton, G.: A simple framework for contrastive learning of visual representations. In: International Conference on Machine Learning, pp. 1597\u20131607. PMLR (2020)"},{"key":"30_CR9","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"104","DOI":"10.1007\/978-3-030-58577-8_7","volume-title":"Computer Vision \u2013 ECCV 2020","author":"YC Chen","year":"2020","unstructured":"Chen, Y.C., et al.: UNITER: UNiversal image-TExt representation learning. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12375, pp. 104\u2013120. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58577-8_7"},{"key":"30_CR10","doi-asserted-by":"crossref","unstructured":"Cheng, M.M., Zhang, Z., Lin, W.Y., Torr, P.: BING: binarized normed gradients for objectness estimation at 300fps. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3286\u20133293 (2014)","DOI":"10.1109\/CVPR.2014.414"},{"key":"30_CR11","doi-asserted-by":"crossref","unstructured":"Dai, Z., Cai, B., Lin, Y., Chen, J.: UP-DETR: unsupervised pre-training for object detection with transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1601\u20131610 (2021)","DOI":"10.1109\/CVPR46437.2021.00165"},{"key":"30_CR12","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. In: NAACL (2019)"},{"key":"30_CR13","doi-asserted-by":"crossref","unstructured":"Dhamija, A., Gunther, M., Ventura, J., Boult, T.: The overlooked elephant of object detection: open set. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 1021\u20131030 (2020)","DOI":"10.1109\/WACV45572.2020.9093355"},{"issue":"2","key":"30_CR14","doi-asserted-by":"publisher","first-page":"303","DOI":"10.1007\/s11263-009-0275-4","volume":"88","author":"M Everingham","year":"2010","unstructured":"Everingham, M., Van Gool, L., Williams, C.K., Winn, J., Zisserman, A.: The pascal visual object classes (VOC) challenge. Int. J. Comput. Vision 88(2), 303\u2013338 (2010). https:\/\/doi.org\/10.1007\/s11263-009-0275-4","journal-title":"Int. J. Comput. Vision"},{"key":"30_CR15","doi-asserted-by":"crossref","first-page":"6024","DOI":"10.1109\/TPAMI.2021.3085766","volume":"44","author":"DP Fan","year":"2021","unstructured":"Fan, D.P., Ji, G.P., Cheng, M.M., Shao, L.: Concealed object detection. IEEE Trans. Pattern Anal. Mach. Intell. 44, 6024\u20136042 (2021)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"30_CR16","doi-asserted-by":"crossref","unstructured":"Fan, D.P., Ji, G.P., Sun, G., Cheng, M.M., Shen, J., Shao, L.: Camouflaged object detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2777\u20132787 (2020)","DOI":"10.1109\/CVPR42600.2020.00285"},{"key":"30_CR17","doi-asserted-by":"crossref","unstructured":"Geiger, A., Lenz, P., Urtasun, R.: Are we ready for autonomous driving? The KITTI vision benchmark suite. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3354\u20133361. IEEE (2012)","DOI":"10.1109\/CVPR.2012.6248074"},{"key":"30_CR18","doi-asserted-by":"crossref","unstructured":"Georgakis, G., Reza, M.A., Mousavian, A., Le, P.H., Ko\u0161eck\u00e1, J.: multiview RGB-D dataset for object instance detection. In: CoRR, pp. 426\u2013434. IEEE (2016)","DOI":"10.1109\/3DV.2016.52"},{"key":"30_CR19","doi-asserted-by":"crossref","unstructured":"Gupta, A., Dollar, P., Girshick, R.: LVIS: a dataset for large vocabulary instance segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5356\u20135364 (2019)","DOI":"10.1109\/CVPR.2019.00550"},{"key":"30_CR20","unstructured":"Gupta, T., Kamath, A., Kembhavi, A., Hoiem, D.: Towards general purpose vision systems. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16399\u201316409 (2022)"},{"key":"30_CR21","doi-asserted-by":"crossref","unstructured":"He, K., Fan, H., Wu, Y., Xie, S., Girshick, R.: Momentum contrast for unsupervised visual representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9729\u20139738 (2020)","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"30_CR22","doi-asserted-by":"crossref","unstructured":"He, K., Gkioxari, G., Doll\u00e1r, P., Girshick, R.: Mask R-CNN. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2961\u20132969 (2017)","DOI":"10.1109\/ICCV.2017.322"},{"key":"30_CR23","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"30_CR24","unstructured":"Honnibal, M., Montani, I.: spaCy: industrial-strength natural language processing in python (2020)"},{"key":"30_CR25","doi-asserted-by":"crossref","unstructured":"Hudson, D.A., Manning, C.D.: GQA: a new dataset for real-world visual reasoning and compositional question answering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6700\u20136709 (2019)","DOI":"10.1109\/CVPR.2019.00686"},{"key":"30_CR26","doi-asserted-by":"crossref","unstructured":"Inoue, N., Furuta, R., Yamasaki, T., Aizawa, K.: Cross-domain weakly-supervised object detection through progressive domain adaptation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5001\u20135009 (2018)","DOI":"10.1109\/CVPR.2018.00525"},{"key":"30_CR27","doi-asserted-by":"crossref","unstructured":"Jaiswal, A., Wu, Y., Natarajan, P., Natarajan, P.: Class-agnostic object detection. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision. pp. 919\u2013928 (2021)","DOI":"10.1109\/WACV48630.2021.00096"},{"key":"30_CR28","doi-asserted-by":"crossref","unstructured":"Joseph, K., Khan, S., Khan, F.S., Balasubramanian, V.N.: Towards open world object detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5830\u20135840 (2021)","DOI":"10.1109\/CVPR46437.2021.00577"},{"key":"30_CR29","doi-asserted-by":"crossref","unstructured":"Kamath, A., Singh, M., LeCun, Y., Synnaeve, G., Misra, I., Carion, N.: MDETR-modulated detection for end-to-end multi-modal understanding. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 1780\u20131790 (2021)","DOI":"10.1109\/ICCV48922.2021.00180"},{"key":"30_CR30","unstructured":"Kazemzadeh, S., Ordonez, V., Matten, M., Berg, T.: Referitgame: Referring to objects in photographs of natural scenes. In: Conference on Empirical Methods in Natural Language Processing"},{"key":"30_CR31","unstructured":"Kim, D., Lin, T.Y., Angelova, A., Kweon, I.S., Kuo, W.: Learning open-world object proposals without learning to classify. ar0Xiv preprint arXiv:2108.06753 (2021)"},{"issue":"1","key":"30_CR32","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/S11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna, R., et al.: Visual genome: connecting language and vision using crowdsourced dense image annotations. Int. J. Comput. Vision 123(1), 32\u201373 (2017). https:\/\/doi.org\/10.1007\/S11263-016-0981-7","journal-title":"Int. J. Comput. Vision"},{"key":"30_CR33","doi-asserted-by":"crossref","unstructured":"Kuo, W., Hariharan, B., Malik, J.: DeepBox: learning objectness with convolutional networks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition,pp. 2479\u20132487 (2015)","DOI":"10.1109\/ICCV.2015.285"},{"issue":"7","key":"30_CR34","doi-asserted-by":"publisher","first-page":"1956","DOI":"10.1007\/s11263-020-01316-z","volume":"128","author":"A Kuznetsova","year":"2020","unstructured":"Kuznetsova, A.: The open images dataset v4. IJCV 128(7), 1956\u20131981 (2020). https:\/\/doi.org\/10.1007\/s11263-020-01316-z","journal-title":"IJCV"},{"key":"30_CR35","doi-asserted-by":"publisher","first-page":"45","DOI":"10.1016\/j.cviu.2019.04.006","volume":"184","author":"TN Le","year":"2019","unstructured":"Le, T.N., Nguyen, T.V., Nie, Z., Tran, M.T., Sugimoto, A.: Anabranch network for camouflaged object segmentation. Comput. Vis. Image Underst. 184, 45\u201356 (2019)","journal-title":"Comput. Vis. Image Underst."},{"key":"30_CR36","unstructured":"Li, L.H., Yatskar, M., Yin, D., Hsieh, C.J., Chang, K.W.: VisualBERT: a simple and performant baseline for vision and language. arXiv preprint arXiv:1908.03557 (2019)"},{"key":"30_CR37","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"121","DOI":"10.1007\/978-3-030-58577-8_8","volume-title":"Computer Vision \u2013 ECCV 2020","author":"X Li","year":"2020","unstructured":"Li, X., et al.: Oscar: object-semantics aligned pre-training for vision-language tasks. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12375, pp. 121\u2013137. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58577-8_8"},{"key":"30_CR38","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., Doll\u00e1r, P., Girshick, R., He, K., Hariharan, B., Belongie, S.: Feature pyramid networks for object detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition,pp. 2117\u20132125 (2017)","DOI":"10.1109\/CVPR.2017.106"},{"key":"30_CR39","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., Goyal, P., Girshick, R., He, K., Doll\u00e1r, P.: Focal loss for dense object detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2980\u20132988 (2017)","DOI":"10.1109\/ICCV.2017.324"},{"key":"30_CR40","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"30_CR41","doi-asserted-by":"crossref","unstructured":"Liu, J.J., Hou, Q., Cheng, M.M., Feng, J., Jiang, J.: A simple pooling-based design for real-time salient object detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3917\u20133926 (2019)","DOI":"10.1109\/CVPR.2019.00404"},{"issue":"2","key":"30_CR42","doi-asserted-by":"publisher","first-page":"261","DOI":"10.1007\/s11263-019-01247-4","volume":"128","author":"L Liu","year":"2020","unstructured":"Liu, L., et al.: Deep learning for generic object detection: a survey. Int. J. Comput. Vision 128(2), 261\u2013318 (2020). https:\/\/doi.org\/10.1007\/s11263-019-01247-4","journal-title":"Int. J. Comput. Vision"},{"key":"30_CR43","unstructured":"Liu, Y., et al.: RoBERTa: a robustly optimized BERT pretraining approach. arXiv preprint arXiv:1907.11692 (2019)"},{"key":"30_CR44","unstructured":"Lu, J., Batra, D., Parikh, D., Lee, S.: ViLBERT: pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. In: Advances in Neural Information Processing Systems (2019)"},{"key":"30_CR45","doi-asserted-by":"crossref","unstructured":"Lu, J., Goswami, V., Rohrbach, M., Parikh, D., Lee, S.: 12-in-1: multi-task vision and language representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10437\u201310446 (2020)","DOI":"10.1109\/CVPR42600.2020.01045"},{"key":"30_CR46","doi-asserted-by":"crossref","unstructured":"Misra, I., Maaten, L.V.D.: Self-supervised learning of pretext-invariant representations. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6707\u20136717 (2020)","DOI":"10.1109\/CVPR42600.2020.00674"},{"issue":"12","key":"30_CR47","doi-asserted-by":"publisher","first-page":"520","DOI":"10.1016\/j.tics.2007.09.009","volume":"11","author":"A Oliva","year":"2007","unstructured":"Oliva, A., Torralba, A.: The role of context in object recognition. Trends Cogn. Sci. 11(12), 520\u2013527 (2007)","journal-title":"Trends Cogn. Sci."},{"key":"30_CR48","doi-asserted-by":"crossref","unstructured":"Peyr\u00e9, G., Cuturi, M.: Computational Optimal Transport (2020)","DOI":"10.1561\/9781680835519"},{"key":"30_CR49","unstructured":"Pinheiro, P.O., Collobert, R., Doll\u00e1r, P.: Learning to segment object candidates. In: Advances in Neural Information Processing Systems (2015)"},{"key":"30_CR50","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"75","DOI":"10.1007\/978-3-319-46448-0_5","volume-title":"Computer Vision \u2013 ECCV 2016","author":"PO Pinheiro","year":"2016","unstructured":"Pinheiro, P.O., Lin, T.-Y., Collobert, R., Doll\u00e1r, P.: Learning to refine object segments. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9905, pp. 75\u201391. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46448-0_5"},{"key":"30_CR51","doi-asserted-by":"crossref","unstructured":"Plummer, B.A., Wang, L., Cervantes, C.M., Caicedo, J.C., Hockenmaier, J., Lazebnik, S.: Flickr30k entities: collecting region-to-phrase correspondences for richer image-to-sentence models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2641\u20132649 (2015)","DOI":"10.1109\/ICCV.2015.303"},{"issue":"1","key":"30_CR52","doi-asserted-by":"publisher","first-page":"128","DOI":"10.1109\/TPAMI.2016.2537320","volume":"39","author":"J Pont-Tuset","year":"2016","unstructured":"Pont-Tuset, J., Arbelaez, P., Barron, J.T., Marques, F., Malik, J.: Multiscale combinatorial grouping for image segmentation and object proposal generation. IEEE Trans. Pattern Anal. Mach. Intell. 39(1), 128\u2013140 (2016)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"30_CR53","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning (2021)"},{"key":"30_CR54","first-page":"91","volume":"28","author":"S Ren","year":"2015","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster R-CNN: towards real-time object detection with region proposal networks. Adv. Neural. Inf. Process. Syst. 28, 91\u201399 (2015)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"issue":"3","key":"30_CR55","doi-asserted-by":"publisher","first-page":"211","DOI":"10.1007\/s11263-015-0816-y","volume":"115","author":"O Russakovsky","year":"2015","unstructured":"Russakovsky, O., et al.: ImageNet large scale visual recognition challenge. Int. J. Comput. Vision 115(3), 211\u2013252 (2015). https:\/\/doi.org\/10.1007\/s11263-015-0816-y","journal-title":"Int. J. Comput. Vision"},{"key":"30_CR56","doi-asserted-by":"crossref","unstructured":"Shao, S., et al.: Objects365: a large-scale, high-quality dataset for object detection. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 8430\u20138439 (2019)","DOI":"10.1109\/ICCV.2019.00852"},{"issue":"4","key":"30_CR57","doi-asserted-by":"publisher","first-page":"717","DOI":"10.1109\/TPAMI.2015.2465960","volume":"38","author":"J Shi","year":"2015","unstructured":"Shi, J., Yan, Q., Xu, L., Jia, J.: Hierarchical image saliency detection on extended CSSD. IEEE Trans. Pattern Anal. Mach. Intell. 38(4), 717\u2013729 (2015)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"30_CR58","unstructured":"Sim\u00e9oni, O., et al.: Localizing objects with self-supervised transformers and no labels. In: British Machine Vision Conference (2021)"},{"key":"30_CR59","unstructured":"Skurowski, P., Abdulameer, H., B\u0142aszczyk, J., Depta, T., Kornacki, A., Kozie\u0142, P.: Animal camouflage analysis: Chameleon database. Unpublished Manuscript 2(6), 7 (2018)"},{"key":"30_CR60","unstructured":"Su, W., et al.: VL-BERT: pre-training of generic visual-linguistic representations. In: International Conference on Learning Representations (2019)"},{"key":"30_CR61","doi-asserted-by":"crossref","unstructured":"Sun, C., Myers, A., Vondrick, C., Murphy, K., Schmid, C.: VideoBERT: a joint model for video and language representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7464\u20137473 (2019)","DOI":"10.1109\/ICCV.2019.00756"},{"key":"30_CR62","doi-asserted-by":"crossref","unstructured":"Tan, H., Bansal, M.: LXMERT: learning cross-modality encoder representations from transformers. In: Conference on Empirical Methods in Natural Language Processing (2019)","DOI":"10.18653\/v1\/D19-1514"},{"key":"30_CR63","unstructured":"Tan, M., Le, Q.: EfficientNet: rethinking model scaling for convolutional neural networks. In: International Conference on Machine Learning, pp. 6105\u20136114. PMLR (2019)"},{"issue":"2","key":"30_CR64","doi-asserted-by":"publisher","first-page":"154","DOI":"10.1007\/s11263-013-0620-5","volume":"104","author":"JR Uijlings","year":"2013","unstructured":"Uijlings, J.R., Van De Sande, K.E., Gevers, T., Smeulders, A.W.: Selective search for object recognition. Int. J. Comput. Vision 104(2), 154\u2013171 (2013). https:\/\/doi.org\/10.1007\/s11263-013-0620-5","journal-title":"Int. J. Comput. Vision"},{"key":"30_CR65","doi-asserted-by":"crossref","unstructured":"Wang, W., Feiszli, M., Wang, H., Tran, D.: Unidentified video objects: a benchmark for dense, open-world segmentation. arXiv preprint arXiv:2104.04691 (2021)","DOI":"10.1109\/ICCV48922.2021.01060"},{"key":"30_CR66","unstructured":"Wang, X., Huang, T.E., Darrell, T., Gonzalez, J.E., Yu, F.: Frustratingly simple few-shot object detection. arXiv preprint arXiv:2003.06957 (2020)"},{"key":"30_CR67","doi-asserted-by":"crossref","unstructured":"Wang, X., Cai, Z., Gao, D., Vasconcelos, N.: Towards universal object detection by domain attention. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7289\u20137298 (2019)","DOI":"10.1109\/CVPR.2019.00746"},{"key":"30_CR68","doi-asserted-by":"publisher","unstructured":"Wightman, R.: PyTorch image models (2019). https:\/\/github.com\/rwightman\/pytorch-image-models. https:\/\/doi.org\/10.5281\/zenodo.4414861","DOI":"10.5281\/zenodo.4414861"},{"key":"30_CR69","doi-asserted-by":"crossref","unstructured":"Wu, K., Otoo, E., Shoshani, A.: Optimizing connected component labeling algorithms. In: Medical Imaging 2005: Image Processing, vol. 5747, pp. 1965\u20131976. International Society for Optics and Photonics (2005)","DOI":"10.1117\/12.596105"},{"key":"30_CR70","unstructured":"Wu, Y., Kirillov, A., Massa, F., Lo, W.Y., Girshick, R.: Detectron2 (2019). https:\/\/github.com\/facebookresearch\/detectron2"},{"key":"30_CR71","doi-asserted-by":"crossref","unstructured":"Wu, Z., Su, L., Huang, Q.: Cascaded partial decoder for fast and accurate salient object detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3907\u20133916 (2019)","DOI":"10.1109\/CVPR.2019.00403"},{"key":"30_CR72","doi-asserted-by":"crossref","unstructured":"Xia, G.S., et al.: DOTA: a large-scale dataset for object detection in aerial images. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3974\u20133983 (2018)","DOI":"10.1109\/CVPR.2018.00418"},{"key":"30_CR73","doi-asserted-by":"crossref","unstructured":"Xiao, T., Reed, C.J., Wang, X., Keutzer, K., Darrell, T.: Region similarity representation learning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (2021)","DOI":"10.1109\/ICCV48922.2021.01037"},{"key":"30_CR74","doi-asserted-by":"crossref","unstructured":"Xie, E., et al.: DetCo: unsupervised contrastive learning for object detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8392\u20138401 (2021)","DOI":"10.1109\/ICCV48922.2021.00828"},{"key":"30_CR75","doi-asserted-by":"crossref","unstructured":"Xie, Q., Luong, M.T., Hovy, E., Le, Q.V.: Self-training with noisy student improves imageNet classification. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10687\u201310698 (2020)","DOI":"10.1109\/CVPR42600.2020.01070"},{"issue":"3","key":"30_CR76","doi-asserted-by":"publisher","DOI":"10.1117\/1.JMI.5.3.036501","volume":"5","author":"K Yan","year":"2018","unstructured":"Yan, K., Wang, X., Lu, L., Summers, R.M.: DeepLesion: automated mining of large-scale lesion annotations and universal lesion detection with deep learning. J. Med. Imaging 5(3), 036501 (2018)","journal-title":"J. Med. Imaging"},{"key":"30_CR77","doi-asserted-by":"crossref","unstructured":"Yang, C., Zhang, L., Lu, H., Ruan, X., Yang, M.H.: Saliency detection via graph-based manifold ranking. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3166\u20133173 (2013)","DOI":"10.1109\/CVPR.2013.407"},{"key":"30_CR78","doi-asserted-by":"crossref","unstructured":"Zareian, A., Rosa, K.D., Hu, D.H., Chang, S.F.: Open-vocabulary object detection using captions. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14393\u201314402 (2021)","DOI":"10.1109\/CVPR46437.2021.01416"},{"key":"30_CR79","unstructured":"Zbontar, J., Jing, L., Misra, I., LeCun, Y., Deny, S.: Barlow twins: self-supervised learning via redundancy reduction. In: International Conference on Machine Learning (2021)"},{"key":"30_CR80","doi-asserted-by":"crossref","unstructured":"Zhang, M., Tseng, C., Kreiman, G.: Putting visual object recognition in context. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12985\u201312994 (2020)","DOI":"10.1109\/CVPR42600.2020.01300"},{"key":"30_CR81","doi-asserted-by":"crossref","unstructured":"Zhang, Z., et al.: BING++: a fast high quality object proposal generator at 100fps. In: IEEE Transactions on Pattern Analysis and Machine Intelligence, vol. 40, pp. 1209\u20131223 (2018)","DOI":"10.1109\/TPAMI.2017.2707492"},{"key":"30_CR82","doi-asserted-by":"crossref","unstructured":"Zhou, M., et al.: UC2: universal cross-lingual cross-modal vision-and-language pre-training. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4155\u20134165 (2021)","DOI":"10.1109\/CVPR46437.2021.00414"},{"key":"30_CR83","unstructured":"Zhu, X., Su, W., Lu, L., Li, B., Wang, X., Dai, J.: Deformable DETR: deformable transformers for end-to-end object detection. In: International Conference on Learning Representations (2021)"},{"key":"30_CR84","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"391","DOI":"10.1007\/978-3-319-10602-1_26","volume-title":"Computer Vision \u2013 ECCV 2014","author":"CL Zitnick","year":"2014","unstructured":"Zitnick, C.L., Doll\u00e1r, P.: Edge boxes: locating object proposals from edges. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 391\u2013405. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_26"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2022"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-20080-9_30","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,11,29]],"date-time":"2023-11-29T23:45:21Z","timestamp":1701301521000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-20080-9_30"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022]]},"ISBN":["9783031200793","9783031200809"],"references-count":84,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-20080-9_30","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2022]]},"assertion":[{"value":"3 November 2022","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Tel Aviv","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Israel","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2022","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 October 2022","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27 October 2022","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2022","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2022.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"CMT","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"5804","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1645","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"28% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.21","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.91","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}