{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,4,8]],"date-time":"2025-04-08T12:50:52Z","timestamp":1744116652725,"version":"3.28.0"},"reference-count":51,"publisher":"IEEE","license":[{"start":{"date-parts":[[2022,6,1]],"date-time":"2022-06-01T00:00:00Z","timestamp":1654041600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2022,6,1]],"date-time":"2022-06-01T00:00:00Z","timestamp":1654041600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022,6]]},"DOI":"10.1109\/cvpr52688.2022.01568","type":"proceedings-article","created":{"date-parts":[[2022,9,27]],"date-time":"2022-09-27T19:56:41Z","timestamp":1664308601000},"page":"16135-16145","source":"Crossref","is-referenced-by-count":12,"title":["Versatile Multi-Modal Pre-Training for Human-Centric Perception"],"prefix":"10.1109","author":[{"given":"Fangzhou","family":"Hong","sequence":"first","affiliation":[{"name":"Nanyang Technological University,S-Lab"}]},{"given":"Liang","family":"Pan","sequence":"additional","affiliation":[{"name":"Nanyang Technological University,S-Lab"}]},{"given":"Zhongang","family":"Cai","sequence":"additional","affiliation":[{"name":"Nanyang Technological University,S-Lab"}]},{"given":"Ziwei","family":"Liu","sequence":"additional","affiliation":[{"name":"Nanyang Technological University,S-Lab"}]}],"member":"263","reference":[{"key":"ref39","article-title":"Learning transferable visual models from natural language supervision","author":"radford","year":"2021","journal-title":"ArXiv Preprint"},{"key":"ref38","first-page":"5099","article-title":"Pointnet++: Deep hierarchical feature learning on point sets in a metric space","author":"qi","year":"0","journal-title":"Advances in neural information processing systems"},{"key":"ref33","article-title":"Representation learning with contrastive predictive coding","author":"van den oord","year":"2018","journal-title":"ArXiv Preprint"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2017.06.006"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.288"},{"key":"ref30","article-title":"P4contrast: Contrastive learning with pairs of point-pixel pairs for rgb-d scene understanding","author":"liu","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref37","first-page":"652","article-title":"Pointnet: Deep learning on point sets for 3d classification and segmentation","author":"qi","year":"0","journal-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition"},{"key":"ref36","first-page":"5171","article-title":"On variational bounds of mutual information","author":"poole","year":"0","journal-title":"International Conference on Machine Learning"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00944"},{"key":"ref34","article-title":"Multi-modal self-supervision from generalized data transformations","author":"patrick","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref28","article-title":"Self-emd: Self-supervised object detection without imagenet","author":"liu","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2916873"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00079"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.471"},{"key":"ref1","first-page":"7","article-title":"Self-supervised multimodal versatile networks","volume":"2","author":"alayrac","year":"2020","journal-title":"NeurIPS"},{"key":"ref20","article-title":"Garment4d: Garment reconstruction from point cloud sequences","author":"hong","year":"0","journal-title":"Thirty-Fifth Conference on Neural Information Processing Systems"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2013.248"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00564"},{"key":"ref24","first-page":"1097","article-title":"Imagenet classification with deep convolutional neural networks","volume":"25","author":"krizhevsky","year":"2012","journal-title":"Advances in neural information processing systems"},{"key":"ref23","article-title":"Supervised contrastive learning","author":"khosla","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref26","first-page":"740","article-title":"Microsoft coco: Common objects in context","author":"lin","year":"0","journal-title":"European Conference on Computer Vision"},{"key":"ref25","article-title":"Multiple-human parsing in the wild","author":"li","year":"2017","journal-title":"ArXiv Preprint"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12328"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00354"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.261"},{"key":"ref11","first-page":"770","article-title":"Instance-level human parsing via part grouping network","author":"gong","year":"0","journal-title":"Proceedings of the European Conference on Computer Vision (ECCV)"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01494"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.715"},{"key":"ref13","article-title":"Bootstrap your own latent: A new approach to self-supervised learning","author":"grill","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00762"},{"key":"ref15","article-title":"Self-supervised co-training for video representation learning","author":"han","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46448-0_10"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413678"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"ref19","first-page":"726","article-title":"Graph cuts optimization for multi-limb human segmentation in depth maps","author":"hern\u00e1ndez-vela","year":"0","journal-title":"2012 IEEE Conference on Computer Vision and Pattern Recognition"},{"key":"ref4","article-title":"Openpose: Realtime multi-person 2d pose estimation using part affinity fields","author":"cao","year":"2019","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"ref3","article-title":"Supervision accelerates pre-training in contrastive semi-supervised learning of visual representations","author":"assran","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref6","article-title":"Improved baselines with momentum contrastive learning","author":"chen","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref5","first-page":"1597","article-title":"A simple framework for contrastive learning of visual representations","author":"chen","year":"0","journal-title":"International Conference on Machine Learning"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01311"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.254"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00088"},{"journal-title":"MMPose Contributors Openmmlab pose estimation tool-box and benchmark","year":"2020","key":"ref9"},{"key":"ref46","article-title":"Can semantic labels assist self-supervised visual representation learning?","author":"wei","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref45","first-page":"776","article-title":"Contrastive multiview coding","author":"tian","year":"0","journal-title":"Computer Vision–ECCV 2020 16th European Conference Glasgow UK August 23–28 2020 Proceedings Part XI 16"},{"key":"ref48","first-page":"466","article-title":"Simple baselines for human pose estimation and tracking","author":"xiao","year":"0","journal-title":"Proceedings of the European Conference on Computer Vision (ECCV)"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00393"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.115"},{"key":"ref41","article-title":"Avlnet: Learning audio-visual language representations from instructional videos","author":"rouditchenko","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00186"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00584"}],"event":{"name":"2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","start":{"date-parts":[[2022,6,18]]},"location":"New Orleans, LA, USA","end":{"date-parts":[[2022,6,24]]}},"container-title":["2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9878378\/9878366\/09879631.pdf?arnumber=9879631","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,10,14]],"date-time":"2022-10-14T20:58:35Z","timestamp":1665781115000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9879631\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,6]]},"references-count":51,"URL":"https:\/\/doi.org\/10.1109\/cvpr52688.2022.01568","relation":{},"subject":[],"published":{"date-parts":[[2022,6]]}}}