{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,11,21]],"date-time":"2024-11-21T12:40:38Z","timestamp":1732192838812,"version":"3.28.0"},"reference-count":45,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T00:00:00Z","timestamp":1733011200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T00:00:00Z","timestamp":1733011200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T00:00:00Z","timestamp":1733011200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T00:00:00Z","timestamp":1733011200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T00:00:00Z","timestamp":1733011200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T00:00:00Z","timestamp":1733011200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T00:00:00Z","timestamp":1733011200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62102285","62272342","62020106004","92048301"],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Computer Vision and Image Understanding"],"published-print":{"date-parts":[[2024,12]]},"DOI":"10.1016\/j.cviu.2024.104220","type":"journal-article","created":{"date-parts":[[2024,11,4]],"date-time":"2024-11-04T08:20:09Z","timestamp":1730708409000},"page":"104220","update-policy":"http:\/\/dx.doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["Monocular depth estimation with boundary attention mechanism and Shifted Window Adaptive Bins"],"prefix":"10.1016","volume":"249","author":[{"given":"Hengjia","family":"Hu","sequence":"first","affiliation":[]},{"given":"Mengnan","family":"Liang","sequence":"additional","affiliation":[]},{"ORCID":"http:\/\/orcid.org\/0000-0002-5661-8796","authenticated-orcid":false,"given":"Congcong","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Meng","family":"Zhao","sequence":"additional","affiliation":[]},{"given":"Fan","family":"Shi","sequence":"additional","affiliation":[]},{"given":"Chao","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Yilin","family":"Han","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.cviu.2024.104220_b1","doi-asserted-by":"crossref","unstructured":"Agarwal,\u00a0A., Arora,\u00a0C., 2023. Attention Attention Everywhere: Monocular Depth Prediction with Skip Attention. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision. pp. 5861\u20135870.","DOI":"10.1109\/WACV56688.2023.00581"},{"key":"10.1016\/j.cviu.2024.104220_b2","unstructured":"Bhat,\u00a0S.F., Alhashim,\u00a0I., Wonka,\u00a0P., 2021. Adabins: Depth estimation using adaptive bins. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 4009\u20134018."},{"key":"10.1016\/j.cviu.2024.104220_b3","series-title":"Proceedings of the Twenty-Eighth International Joint Conference on Artificial Intelligence, IJCAI 2019, Macao, China, August 10-16, 2019","first-page":"694","article-title":"Structure-aware residual pyramid network for monocular depth estimation","author":"Chen","year":"2019"},{"issue":"4","key":"10.1016\/j.cviu.2024.104220_b4","doi-asserted-by":"crossref","first-page":"834","DOI":"10.1109\/TPAMI.2017.2699184","article-title":"Deeplab: Semantic image segmentation with deep convolutional nets, atrous convolution, and fully connected crfs","volume":"40","author":"Chen","year":"2017","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.cviu.2024.104220_b5","series-title":"2017 IEEE\/RSJ International Conference on Intelligent Robots and Systems","first-page":"1505","article-title":"Joint prediction of depths, normals and surface curvature from rgb images using cnns","author":"Dharmasiri","year":"2017"},{"year":"2020","series-title":"An image is worth 16x16 words: Transformers for image recognition at scale","author":"Dosovitskiy","key":"10.1016\/j.cviu.2024.104220_b6"},{"key":"10.1016\/j.cviu.2024.104220_b7","doi-asserted-by":"crossref","unstructured":"Eigen,\u00a0D., Fergus,\u00a0R., 2015. Predicting depth, surface normals and semantic labels with a common multi-scale convolutional architecture. In: Proceedings of the IEEE International Conference on Computer Vision. pp. 2650\u20132658.","DOI":"10.1109\/ICCV.2015.304"},{"key":"10.1016\/j.cviu.2024.104220_b8","article-title":"Depth map prediction from a single image using a multi-scale deep network","volume":"27","author":"Eigen","year":"2014","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.cviu.2024.104220_b9","doi-asserted-by":"crossref","unstructured":"Fu,\u00a0H., Gong,\u00a0M., Wang,\u00a0C., Batmanghelich,\u00a0K., Tao,\u00a0D., 2018. Deep ordinal regression network for monocular depth estimation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. pp. 2002\u20132011.","DOI":"10.1109\/CVPR.2018.00214"},{"key":"10.1016\/j.cviu.2024.104220_b10","doi-asserted-by":"crossref","unstructured":"Gan,\u00a0Y., Xu,\u00a0X., Sun,\u00a0W., Lin,\u00a0L., 2018. Monocular depth estimation with affinity, vertical pooling, and label enhancement. In: Proceedings of the European Conference on Computer Vision. ECCV, pp. 224\u2013239.","DOI":"10.1007\/978-3-030-01219-9_14"},{"key":"10.1016\/j.cviu.2024.104220_b11","series-title":"2012 IEEE Conference on Computer Vision and Pattern Recognition","first-page":"3354","article-title":"Are we ready for autonomous driving? the kitti vision benchmark suite","author":"Geiger","year":"2012"},{"key":"10.1016\/j.cviu.2024.104220_b12","doi-asserted-by":"crossref","unstructured":"Guizilini,\u00a0V., Ambrus,\u00a0R., Burgard,\u00a0W., Gaidon,\u00a0A., 2021. Sparse auxiliary networks for unified monocular depth prediction and completion. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 11078\u201311088.","DOI":"10.1109\/CVPR46437.2021.01093"},{"key":"10.1016\/j.cviu.2024.104220_b13","doi-asserted-by":"crossref","unstructured":"He,\u00a0K., Zhang,\u00a0X., Ren,\u00a0S., Sun,\u00a0J., 2016. Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. pp. 770\u2013778.","DOI":"10.1109\/CVPR.2016.90"},{"key":"10.1016\/j.cviu.2024.104220_b14","series-title":"2019 IEEE Winter Conference on Applications of Computer Vision","first-page":"1043","article-title":"Revisiting single image depth estimation: Toward higher resolution maps with accurate object boundaries","author":"Hu","year":"2019"},{"key":"10.1016\/j.cviu.2024.104220_b15","doi-asserted-by":"crossref","unstructured":"Hu,\u00a0J., Shen,\u00a0L., Sun,\u00a0G., 2018. Squeeze-and-excitation networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. pp. 7132\u20137141.","DOI":"10.1109\/CVPR.2018.00745"},{"key":"10.1016\/j.cviu.2024.104220_b16","doi-asserted-by":"crossref","unstructured":"Huang,\u00a0G., Liu,\u00a0Z., Van Der\u00a0Maaten,\u00a0L., Weinberger,\u00a0K.Q., 2017. Densely connected convolutional networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. pp. 4700\u20134708.","DOI":"10.1109\/CVPR.2017.243"},{"key":"10.1016\/j.cviu.2024.104220_b17","series-title":"Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XXVI 16","first-page":"581","article-title":"Guiding monocular depth estimation using depth-attention volume","author":"Huynh","year":"2020"},{"year":"2014","series-title":"Adam: A method for stochastic optimization","author":"Kingma","key":"10.1016\/j.cviu.2024.104220_b18"},{"key":"10.1016\/j.cviu.2024.104220_b19","series-title":"2016 Fourth International Conference on 3D Vision","first-page":"239","article-title":"Deeper depth prediction with fully convolutional residual networks","author":"Laina","year":"2016"},{"year":"2019","series-title":"From big to small: Multi-scale local planar guidance for monocular depth estimation","author":"Lee","key":"10.1016\/j.cviu.2024.104220_b20"},{"key":"10.1016\/j.cviu.2024.104220_b21","doi-asserted-by":"crossref","unstructured":"Lee,\u00a0M., Hwang,\u00a0S., Park,\u00a0C., Lee,\u00a0S., 2022. Edgeconv with attention module for monocular depth estimation. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision. pp. 2858\u20132867.","DOI":"10.1109\/WACV51458.2022.00242"},{"key":"10.1016\/j.cviu.2024.104220_b22","doi-asserted-by":"crossref","first-page":"343","DOI":"10.1016\/j.neucom.2020.11.002","article-title":"Attention based multilayer feature fusion convolutional neural network for unsupervised monocular depth estimation","volume":"423","author":"Lei","year":"2021","journal-title":"Neurocomputing"},{"year":"2022","series-title":"Depthformer: Exploiting long-range correlation and local information for accurate monocular depth estimation","author":"Li","key":"10.1016\/j.cviu.2024.104220_b23"},{"year":"2022","series-title":"Binsformer: Revisiting adaptive bins for monocular depth estimation","author":"Li","key":"10.1016\/j.cviu.2024.104220_b24"},{"key":"10.1016\/j.cviu.2024.104220_b25","doi-asserted-by":"crossref","unstructured":"Lin,\u00a0T.-Y., Doll\u00e1r,\u00a0P., Girshick,\u00a0R., He,\u00a0K., Hariharan,\u00a0B., Belongie,\u00a0S., 2017. Feature pyramid networks for object detection. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. pp. 2117\u20132125.","DOI":"10.1109\/CVPR.2017.106"},{"key":"10.1016\/j.cviu.2024.104220_b26","doi-asserted-by":"crossref","unstructured":"Liu,\u00a0Z., Lin,\u00a0Y., Cao,\u00a0Y., Hu,\u00a0H., Wei,\u00a0Y., Zhang,\u00a0Z., Lin,\u00a0S., Guo,\u00a0B., 2021a. Swin transformer: Hierarchical vision transformer using shifted windows. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 10012\u201310022.","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"10.1016\/j.cviu.2024.104220_b27","series-title":"2020 25th International Conference on Pattern Recognition","first-page":"5137","article-title":"Multi-scale residual pyramid attention network for monocular depth estimation","author":"Liu","year":"2021"},{"key":"10.1016\/j.cviu.2024.104220_b28","series-title":"2018 IEEE International Conference on Robotics and Automation","first-page":"4796","article-title":"Sparse-to-dense: Depth prediction from sparse depth samples and a single image","author":"Ma","year":"2018"},{"key":"10.1016\/j.cviu.2024.104220_b29","doi-asserted-by":"crossref","unstructured":"Naderi,\u00a0T., Sadovnik,\u00a0A., Hayward,\u00a0J., Qi,\u00a0H., 2022. Monocular depth estimation with adaptive geometric attention. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision. pp. 944\u2013954.","DOI":"10.1109\/WACV51458.2022.00069"},{"key":"10.1016\/j.cviu.2024.104220_b30","article-title":"Pytorch: An imperative style, high-performance deep learning library","volume":"32","author":"Paszke","year":"2019","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.cviu.2024.104220_b31","doi-asserted-by":"crossref","unstructured":"Patil,\u00a0V., Sakaridis,\u00a0C., Liniger,\u00a0A., Van\u00a0Gool,\u00a0L., 2022. P3depth: Monocular depth estimation with a piecewise planarity prior. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 1610\u20131621.","DOI":"10.1109\/CVPR52688.2022.00166"},{"key":"10.1016\/j.cviu.2024.104220_b32","doi-asserted-by":"crossref","unstructured":"Ranftl,\u00a0R., Bochkovskiy,\u00a0A., Koltun,\u00a0V., 2021. Vision transformers for dense prediction. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 12179\u201312188.","DOI":"10.1109\/ICCV48922.2021.01196"},{"key":"10.1016\/j.cviu.2024.104220_b33","series-title":"Medical Image Computing and Computer-Assisted Intervention\u2013MICCAI 2015: 18th International Conference, Munich, Germany, October 5-9, 2015, Proceedings, Part III 18","first-page":"234","article-title":"U-net: Convolutional networks for biomedical image segmentation","author":"Ronneberger","year":"2015"},{"key":"10.1016\/j.cviu.2024.104220_b34","article-title":"Learning depth from single monocular images","volume":"18","author":"Saxena","year":"2005","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.cviu.2024.104220_b35","series-title":"ECCV (5)","first-page":"746","article-title":"Indoor segmentation and support inference from rgbd images","volume":"Vol. 7576","author":"Silberman","year":"2012"},{"key":"10.1016\/j.cviu.2024.104220_b36","series-title":"2015 IEEE Conference on Computer Vision and Pattern Recognition","first-page":"567","article-title":"SUN RGB-D: A RGB-D scene understanding benchmark suite","author":"Song","year":"2015"},{"key":"10.1016\/j.cviu.2024.104220_b37","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2023.109982","article-title":"CATNet: Convolutional attention and transformer for monocular depth estimation","volume":"145","author":"Tang","year":"2024","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.cviu.2024.104220_b38","article-title":"Attention is all you need","volume":"30","author":"Vaswani","year":"2017","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.cviu.2024.104220_b39","series-title":"European Conference on Computer Vision","first-page":"316","article-title":"Cliffnet for monocular depth estimation with hierarchical embedding loss","author":"Wang","year":"2020"},{"key":"10.1016\/j.cviu.2024.104220_b40","doi-asserted-by":"crossref","unstructured":"Woo,\u00a0S., Park,\u00a0J., Lee,\u00a0J.-Y., Kweon,\u00a0I.S., 2018. Cbam: Convolutional block attention module. In: Proceedings of the European Conference on Computer Vision. ECCV, pp. 3\u201319.","DOI":"10.1007\/978-3-030-01234-2_1"},{"key":"10.1016\/j.cviu.2024.104220_b41","doi-asserted-by":"crossref","DOI":"10.1016\/j.imavis.2022.104520","article-title":"Rich global feature guided network for monocular depth estimation","volume":"125","author":"Wu","year":"2022","journal-title":"Image Vis. Comput."},{"key":"10.1016\/j.cviu.2024.104220_b42","series-title":"Transformers solve the limited receptive field for monocular depth prediction","first-page":"5","author":"Yang","year":"2021"},{"key":"10.1016\/j.cviu.2024.104220_b43","doi-asserted-by":"crossref","unstructured":"Yin,\u00a0W., Liu,\u00a0Y., Shen,\u00a0C., Yan,\u00a0Y., 2019. Enforcing geometric constraints of virtual normal for depth prediction. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 5684\u20135693.","DOI":"10.1109\/ICCV.2019.00578"},{"year":"2022","series-title":"New crfs: Neural window fully-connected crfs for monocular depth estimation","author":"Yuan","key":"10.1016\/j.cviu.2024.104220_b44"},{"key":"10.1016\/j.cviu.2024.104220_b45","doi-asserted-by":"crossref","unstructured":"Zhao,\u00a0H., Shi,\u00a0J., Qi,\u00a0X., Wang,\u00a0X., Jia,\u00a0J., 2017. Pyramid scene parsing network. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. pp. 2881\u20132890.","DOI":"10.1109\/CVPR.2017.660"}],"container-title":["Computer Vision and Image Understanding"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S1077314224003011?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S1077314224003011?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2024,11,21]],"date-time":"2024-11-21T11:47:04Z","timestamp":1732189624000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S1077314224003011"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12]]},"references-count":45,"alternative-id":["S1077314224003011"],"URL":"https:\/\/doi.org\/10.1016\/j.cviu.2024.104220","relation":{},"ISSN":["1077-3142"],"issn-type":[{"type":"print","value":"1077-3142"}],"subject":[],"published":{"date-parts":[[2024,12]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Monocular depth estimation with boundary attention mechanism and Shifted Window Adaptive Bins","name":"articletitle","label":"Article Title"},{"value":"Computer Vision and Image Understanding","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.cviu.2024.104220","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2024 Elsevier Inc. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"104220"}}