{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,8,27]],"date-time":"2024-08-27T21:45:27Z","timestamp":1724795127620},"reference-count":52,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2017,4,1]],"date-time":"2017-04-01T00:00:00Z","timestamp":1491004800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Neural Networks"],"published-print":{"date-parts":[[2017,4]]},"DOI":"10.1016\/j.neunet.2017.01.003","type":"journal-article","created":{"date-parts":[[2017,1,30]],"date-time":"2017-01-30T19:30:39Z","timestamp":1485804639000},"page":"105-113","update-policy":"http:\/\/dx.doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":20,"special_numbering":"C","title":["Object class segmentation of RGB-D video using recurrent convolutional neural networks"],"prefix":"10.1016","volume":"88","author":[{"given":"Mircea Serban","family":"Pavel","sequence":"first","affiliation":[]},{"given":"Hannes","family":"Schulz","sequence":"additional","affiliation":[]},{"ORCID":"http:\/\/orcid.org\/0000-0002-5040-7525","authenticated-orcid":false,"given":"Sven","family":"Behnke","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.neunet.2017.01.003_br000005","doi-asserted-by":"crossref","unstructured":"Batra, D., Sukthankar, R., & Chen, T. (2008). Learning class-specific affinities for image labelling. In Conference on computer vision and pattern recognition, CVPR.","DOI":"10.1109\/CVPR.2008.4587432"},{"key":"10.1016\/j.neunet.2017.01.003_br000010","series-title":"Hierarchical neural networks for image interpretation","volume":"Vol. 2766","author":"Behnke","year":"2003"},{"key":"10.1016\/j.neunet.2017.01.003_br000015","unstructured":"Bogun, I., Angelova, A., & Jaitly, N. (2015). Object recognition from short videos for robotic perception. arXiv preprint arXiv:1509.01602."},{"key":"10.1016\/j.neunet.2017.01.003_br000020","doi-asserted-by":"crossref","DOI":"10.1109\/TPAMI.2010.143","article-title":"Large displacement optical flow: descriptor matching in variational motion estimation","volume":"33","author":"Brox","year":"2011","journal-title":"Transactions on Pattern Analysis and Machine Intelligence"},{"key":"10.1016\/j.neunet.2017.01.003_br000025","unstructured":"Chen, L.-C., Papandreou, G., Kokkinos, I., Murphy, K., & Yuille, A.L. (2014). Semantic image segmentation with deep convolutional nets and fully connected crfs. arXiv preprint arXiv:1412.7062."},{"key":"10.1016\/j.neunet.2017.01.003_br000030","unstructured":"Chen, L.-C., Papandreou, G., Kokkinos, I., Murphy, K., & Yuille, A. L. (2015). Semantic image segmentation with deep convolutional nets and fully connected CRFs. In International conference on learning representations, ICLR."},{"key":"10.1016\/j.neunet.2017.01.003_br000035","unstructured":"Ciresan, D., Giusti, A., Gambardella, L. M., & Schmidhuber, J. (2012). Deep neural networks segment neuronal membranes in electron microscopy images. In Advances in neural information processing systems, NIPS."},{"key":"10.1016\/j.neunet.2017.01.003_br000040","unstructured":"Couprie, C., Farabet, C., Najman, L., & LeCun, Y. (2013). Indoor semantic segmentation using depth information. In International conference on learning representations, ICLR."},{"key":"10.1016\/j.neunet.2017.01.003_br000045","unstructured":"Dauphin, Y.N., de Vries, H., Chung, J., & Bengio, Y. (2015). Rmsprop and equilibrated adaptive learning rates for non-convex optimization. arXiv preprint arXiv:1502.04390."},{"key":"10.1016\/j.neunet.2017.01.003_br000050","doi-asserted-by":"crossref","unstructured":"Eigen, D., & Fergus, R. (2015). Predicting depth, surface normals and semantic labels with a common multi-scale convolutional architecture. In International conference on computer vision, ICCV.","DOI":"10.1109\/ICCV.2015.304"},{"key":"10.1016\/j.neunet.2017.01.003_br000055","first-page":"625","article-title":"Why does unsupervised pre-training help deep learning?","volume":"11","author":"Erhan","year":"2010","journal-title":"Journal of Machine Learning Research (JMLR)"},{"key":"10.1016\/j.neunet.2017.01.003_br000060","series-title":"Image analysis","article-title":"Two-frame motion estimation based on polynomial expansion","author":"Farneb\u00e4ck","year":"2003"},{"key":"10.1016\/j.neunet.2017.01.003_br000065","series-title":"Supervised sequence labelling with recurrent neural networks","volume":"Vol. 385","author":"Graves","year":"2012"},{"key":"10.1016\/j.neunet.2017.01.003_br000070","doi-asserted-by":"crossref","unstructured":"Graves, A., Abdelrahman, M., & Hinton, G. E. (2013). Speech recognition with deep recurrent neural networks. In International conference on acoustics, speech and signal processing, ICASSP.","DOI":"10.1109\/ICASSP.2013.6638947"},{"key":"10.1016\/j.neunet.2017.01.003_br000075","doi-asserted-by":"crossref","unstructured":"Gupta, S., Girshick, R., Arbel\u00e1ez, P., & Malik, J. (2014). Learning rich features from rgb-d images for object detection and segmentation. In European conference on computer vision, ECCV.","DOI":"10.1007\/978-3-319-10584-0_23"},{"key":"10.1016\/j.neunet.2017.01.003_br000080","unstructured":"He, X., Zemel, R. S., & Carreira-Perpi\u00f1\u00e1n, M. \u00c1. (2004). Multiscale conditional random fields for image labeling. In Conference on computer vision and pattern recognition, CVPR."},{"key":"10.1016\/j.neunet.2017.01.003_br000085","unstructured":"He, K., Zhang, X., Ren, S., & Sun, J. (2015). Delving deep into rectifiers: Surpassing human-level performance on imagenet classification. arXiv preprint arXiv:1502.01852."},{"key":"10.1016\/j.neunet.2017.01.003_br000090","doi-asserted-by":"crossref","unstructured":"H\u00f6ft, N., Schulz, H., & Behnke, S. (2014). Fast semantic segmentation of RGB-D scenes with GPU-accelerated deep neural networks. In German conference on artificial intelligence, KI.","DOI":"10.1007\/978-3-319-11206-0_9"},{"key":"10.1016\/j.neunet.2017.01.003_br000095","doi-asserted-by":"crossref","unstructured":"Jung, M., Hwang, J., & Tani, J. (2014). Multiple spatio-temporal scales neural network for contextual visual recognition of human actions. In International conference on development and learning and on epigenetic robotics, ICDL.","DOI":"10.1109\/DEVLRN.2014.6982987"},{"key":"10.1016\/j.neunet.2017.01.003_br000100","doi-asserted-by":"crossref","unstructured":"Karpathy, A., Toderici, G., Shetty, S., Leung, T., Sukthankar, R., & Fei-Fei, L. (2014). Large-scale video classification with convolutional neural networks. In Conference on computer vision and pattern recognition, CVPR.","DOI":"10.1109\/CVPR.2014.223"},{"key":"10.1016\/j.neunet.2017.01.003_br000105","unstructured":"Konda, K.R., Memisevic, R., & Michalski, V. (2013). Learning to encode motion using spatio-temporal synchrony. arXiv preprint arXiv:1306.3162."},{"key":"10.1016\/j.neunet.2017.01.003_br000110","doi-asserted-by":"crossref","unstructured":"Lai, K., Bo, L., Ren, X., & Fox, D. (2011). A large-scale hierarchical multi-view rgb-d object dataset. In International conference on robotics and automation, ICRA.","DOI":"10.1109\/ICRA.2011.5980382"},{"key":"10.1016\/j.neunet.2017.01.003_br000115","doi-asserted-by":"crossref","unstructured":"Le, Q. V., Zou, W. Y., Yeung, S. Y., & Ng, A. Y. (2011). Learning hierarchical invariant spatio-temporal features for action recognition with independent subspace analysis. In Conference on computer vision and pattern recognition, CVPR.","DOI":"10.1109\/CVPR.2011.5995496"},{"issue":"11","key":"10.1016\/j.neunet.2017.01.003_br000120","doi-asserted-by":"crossref","first-page":"2278","DOI":"10.1109\/5.726791","article-title":"Gradient-based learning applied to document recognition","volume":"86","author":"LeCun","year":"1998","journal-title":"Proceedings of the IEEE"},{"key":"10.1016\/j.neunet.2017.01.003_br000125","doi-asserted-by":"crossref","unstructured":"Levin, A., Lischinski, D., & Weiss, Y. (2004). Colorization using optimization. In Special interest group on graphics and interactive techniques, SIGGRAPH.","DOI":"10.1145\/1186562.1015780"},{"key":"10.1016\/j.neunet.2017.01.003_br000130","doi-asserted-by":"crossref","unstructured":"Liang, M., & Hu, X. (2015). Recurrent convolutional neural network for object recognition. In Conference on Computer Vision and Pattern Recognition, CVPR, June.","DOI":"10.1109\/CVPR.2015.7298958"},{"key":"10.1016\/j.neunet.2017.01.003_br000135","doi-asserted-by":"crossref","unstructured":"Liu, Z., Li, X., Luo, P., Loy, C. C., & Tang, X. (2015). Semantic image segmentation via deep parsing network. In International conference on computer vision, ICCV.","DOI":"10.1109\/ICCV.2015.162"},{"key":"10.1016\/j.neunet.2017.01.003_br000140","doi-asserted-by":"crossref","unstructured":"Long, J., Shelhamer, E., & Darrell, T. (2015a). Fully convolutional networks for semantic segmentation. In Conference on computer vision and pattern recognition, CVPR.","DOI":"10.1109\/CVPR.2015.7298965"},{"key":"10.1016\/j.neunet.2017.01.003_br000145","doi-asserted-by":"crossref","unstructured":"Long, J., Shelhamer, E., & Darrell, T. (2015b). Fully convolutional networks for semantic segmentation. In Conference on computer vision and pattern recognition, CVPR.","DOI":"10.1109\/CVPR.2015.7298965"},{"key":"10.1016\/j.neunet.2017.01.003_br000150","unstructured":"Michalski, V., Memisevic, R., & Konda, K. (2014). Modeling deep temporal dependencies with recurrent grammar cells. In Advances in neural information processing systems, NIPS."},{"key":"10.1016\/j.neunet.2017.01.003_br000155","doi-asserted-by":"crossref","unstructured":"M\u00fcller, A.C., & Behnke, S. (2014). Learning depth-sensitive conditional random fields for semantic segmentation of rgb-d images. In International conference on robotics and automation, ICRA.","DOI":"10.1109\/ICRA.2014.6907778"},{"key":"10.1016\/j.neunet.2017.01.003_br000160","doi-asserted-by":"crossref","unstructured":"Noh, H., Hong, S., & Han, B. (2015). Learning deconvolution network for semantic segmentation. In International conference on computer vision, ICCV.","DOI":"10.1109\/ICCV.2015.178"},{"key":"10.1016\/j.neunet.2017.01.003_br000165","first-page":"1310","article-title":"On the difficulty of training recurrent neural networks","volume":"28","author":"Pascanu","year":"2013","journal-title":"Journal of Machine Learning Research (JMLR)"},{"key":"10.1016\/j.neunet.2017.01.003_br000170","doi-asserted-by":"crossref","unstructured":"Pham, V., Bluche, T., Kermorvant, C., & Louradour, J. (2014). Dropout improves recurrent neural networks for handwriting recognition. In International conference on frontiers in handwriting recognition, ICFHR.","DOI":"10.1109\/ICFHR.2014.55"},{"key":"10.1016\/j.neunet.2017.01.003_br000175","unstructured":"Pinheiro, P. H., & Collobert, R. (2014). Recurrent convolutional neural networks for scene labeling. In International conference on machine learning, ICML."},{"key":"10.1016\/j.neunet.2017.01.003_br000180","doi-asserted-by":"crossref","unstructured":"Riedmiller, M., & Braun, H. (1993). A direct adaptive method for faster backpropagation learning: The RPROP algorithm. In International conference on neural networks.","DOI":"10.1109\/ICNN.1993.298623"},{"key":"10.1016\/j.neunet.2017.01.003_br000185","unstructured":"Russell, C., Kohli, P., & Torr, P. H. et al. (2009). Associative hierarchical crfs for object class image segmentation. In International conference on computer vision, ICCV."},{"key":"10.1016\/j.neunet.2017.01.003_br000190","unstructured":"Schulz, H., & Behnke, S. (2012). Learning object-class segmentation with convolutional neural networks. In European symposium on artificial neural networks, ESANN."},{"key":"10.1016\/j.neunet.2017.01.003_br000195","unstructured":"Schulz, H., H\u00f6ft, N., & Behnke, S. (2015a). Depth and height aware semantic RGB-D perception with convolutional neural networks. In European symposium on artificial neural networks, ESANN."},{"key":"10.1016\/j.neunet.2017.01.003_br000200","doi-asserted-by":"crossref","unstructured":"Schulz, H., Waldvogel, B., Sheikh, R., & Behnke, S. (2015b). CURFIL: Random forests for image labeling on GPU. In International conference on computer vision theory and applications, VISAPP.","DOI":"10.5220\/0005316201560164"},{"key":"10.1016\/j.neunet.2017.01.003_br000205","unstructured":"Sermanet, P., Eigen, D., Zhang, X., Mathieu, M., Fergus, R., & LeCun, Y. (2014). Overfeat: Integrated recognition, localization and detection using convolutional networks. In International conference on learning representations, ICLR."},{"key":"10.1016\/j.neunet.2017.01.003_br000210","doi-asserted-by":"crossref","unstructured":"Sharif Razavian, A., Azizpour, H., Sullivan, J., & Carlsson, S. (2014). Cnn features off-the-shelf: an astounding baseline for recognition. In Computer vision and pattern recognition (CVPR) workshops.","DOI":"10.1109\/CVPRW.2014.131"},{"key":"10.1016\/j.neunet.2017.01.003_br000215","doi-asserted-by":"crossref","DOI":"10.1145\/2398356.2398381","article-title":"Real-time human pose recognition in parts from single depth images","author":"Shotton","year":"2013","journal-title":"Communications of the ACM"},{"key":"10.1016\/j.neunet.2017.01.003_br000220","doi-asserted-by":"crossref","unstructured":"Shotton, J., Winn, J., Rother, C., & Criminisi, A. (2006). Textonboost: Joint appearance, shape and context modeling for multi-class object recognition and segmentation. In European conference on computer vision, ECCV.","DOI":"10.1007\/11744023_1"},{"key":"10.1016\/j.neunet.2017.01.003_br000225","doi-asserted-by":"crossref","unstructured":"Silberman, N., Hoiem, D., Kohli, P., & Fergus, R. (2012). Indoor segmentation and support inference from RGBD images. In European conference on computer vision, ECCV.","DOI":"10.1007\/978-3-642-33715-4_54"},{"key":"10.1016\/j.neunet.2017.01.003_br000230","unstructured":"Simonyan, K., & Zisserman, A. (2014). Two-stream convolutional networks for action recognition in videos. In Advances in neural information processing systems, NIPS."},{"key":"10.1016\/j.neunet.2017.01.003_br000235","unstructured":"Sohn, K., Yan, X., & Lee, H. (2015). Learning structured output representation using deep conditional generative models. In Advances in neural information processing systems, NIPS."},{"key":"10.1016\/j.neunet.2017.01.003_br000240","first-page":"1929","article-title":"Dropout: A simple way to prevent neural networks from overfitting","volume":"15","author":"Srivastava","year":"2014","journal-title":"Journal of Machine Learning Research (JMLR)"},{"key":"10.1016\/j.neunet.2017.01.003_br000245","article-title":"Dense real-time mapping of object-class semantics from RGB-D video","author":"St\u00fcckler","year":"2013","journal-title":"Journal of Real-Time Image Processing"},{"key":"10.1016\/j.neunet.2017.01.003_br000250","doi-asserted-by":"crossref","unstructured":"Sundermeyer, M., Schl\u00fcter, R., & Ney, H. (2012). LSTM neural networks for language modeling. In Interspeech.","DOI":"10.21437\/Interspeech.2012-65"},{"key":"10.1016\/j.neunet.2017.01.003_br000255","series-title":"European conference on computer vision","article-title":"Convolutional learning of spatio-temporal features","author":"Taylor","year":"2010"},{"key":"10.1016\/j.neunet.2017.01.003_br000260","doi-asserted-by":"crossref","unstructured":"Zheng, S., Jayasumana, S., Romera-Paredes, B., Vineet, V., Su, Z., & Du, D. et al. (2015). Conditional random fields as recurrent neural networks. In International conference on computer vision, ICCV.","DOI":"10.1109\/ICCV.2015.179"}],"container-title":["Neural Networks"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0893608017300035?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0893608017300035?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2022,7,23]],"date-time":"2022-07-23T01:35:37Z","timestamp":1658540137000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0893608017300035"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2017,4]]},"references-count":52,"alternative-id":["S0893608017300035"],"URL":"https:\/\/doi.org\/10.1016\/j.neunet.2017.01.003","relation":{},"ISSN":["0893-6080"],"issn-type":[{"value":"0893-6080","type":"print"}],"subject":[],"published":{"date-parts":[[2017,4]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Object class segmentation of RGB-D video using recurrent convolutional neural networks","name":"articletitle","label":"Article Title"},{"value":"Neural Networks","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.neunet.2017.01.003","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2017 Elsevier Ltd. All rights reserved.","name":"copyright","label":"Copyright"}]}}