{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T10:01:49Z","timestamp":1740132109142,"version":"3.37.3"},"reference-count":94,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"11","license":[{"start":{"date-parts":[[2023,11,1]],"date-time":"2023-11-01T00:00:00Z","timestamp":1698796800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2023,11,1]],"date-time":"2023-11-01T00:00:00Z","timestamp":1698796800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,11,1]],"date-time":"2023-11-01T00:00:00Z","timestamp":1698796800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"National Key RD Program of China","award":["2022ZD0160100"]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U20A20183","62021001"],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Shanghai Committee of Science and Technology","award":["21DZ1100100"]},{"name":"GPU"},{"name":"Supercomputing Center of the USTC"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Pattern Anal. Mach. Intell."],"published-print":{"date-parts":[[2023,11,1]]},"DOI":"10.1109\/tpami.2023.3296823","type":"journal-article","created":{"date-parts":[[2023,7,19]],"date-time":"2023-07-19T20:30:03Z","timestamp":1689798603000},"page":"13636-13652","source":"Crossref","is-referenced-by-count":23,"title":["TransVG++: End-to-End Visual Grounding With Language Conditioned Vision Transformer"],"prefix":"10.1109","volume":"45","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9624-7451","authenticated-orcid":false,"given":"Jiajun","family":"Deng","sequence":"first","affiliation":[{"name":"University of Science and Technology of China, Hefei, Anhui, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5808-0889","authenticated-orcid":false,"given":"Zhengyuan","family":"Yang","sequence":"additional","affiliation":[{"name":"Microsoft, Redmond, WA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8286-0105","authenticated-orcid":false,"given":"Daqing","family":"Liu","sequence":"additional","affiliation":[{"name":"JD Explore Academy, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6355-6474","authenticated-orcid":false,"given":"Tianlang","family":"Chen","sequence":"additional","affiliation":[{"name":"Amazon, Seattle, WA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1690-9836","authenticated-orcid":false,"given":"Wengang","family":"Zhou","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, Anhui, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9046-798X","authenticated-orcid":false,"given":"Yanyong","family":"Zhang","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, Anhui, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2188-3028","authenticated-orcid":false,"given":"Houqiang","family":"Li","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, Anhui, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9163-2761","authenticated-orcid":false,"given":"Wanli","family":"Ouyang","sequence":"additional","affiliation":[{"name":"Shanghai Artificial Intelligent Laboratory, Shanghai, China"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46493-0_48"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2797921"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.493"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3123439"},{"article-title":"Real-time referring expression comprehension by single-stage grounding network","year":"2018","author":"Chen","key":"ref5"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01089"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00478"},{"article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Dosovitskiy","key":"ref8"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.541"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00477"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58568-6_23"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00142"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00206"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00474"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00997"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2911066"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46475-6_5"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.9"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1086"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-016-0965-7"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00179"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.470"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00437"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00447"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00479"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.81"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.169"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2577031"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-013-0620-5"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01258-8_16"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00636"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.122"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2020.2973983"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00205"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00438"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.95"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00430"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01509"},{"article-title":"YOLOv3: An Incremental Improvement","year":"2018","author":"Redmon","key":"ref39"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2010.5539960"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2014.2345390"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58529-7_35"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00928"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01506"},{"key":"ref45","first-page":"5998","article-title":"Attention is all you need","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Vaswani"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2010-343"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/P15-1150"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"ref50","first-page":"1691","article-title":"Generative pretraining from pixels","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Chen"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00583"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58517-4_31"},{"key":"ref53","first-page":"10347","article-title":"Training data-efficient image transformers & distillation through attention","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Touvron"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00363"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20080-9_41"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20077-9_17"},{"article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","year":"2018","author":"Devlin","key":"ref58"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_8"},{"key":"ref61","first-page":"13","article-title":"ViLBERT: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Lu"},{"article-title":"VL-BERT: Pre-training of generic visual-linguistic representations","year":"2019","author":"Su","key":"ref62"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr46437.2021.00864"},{"article-title":"Pixel-BERT: Aligning image pixels with text by deep multi-modal transformers","year":"2020","author":"Huang","key":"ref64"},{"key":"ref65","first-page":"5583","article-title":"ViLT: Vision-and-language transformer without convolution or region supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Kim"},{"key":"ref66","first-page":"9694","article-title":"Align before fuse: Vision and language representation learning with momentum distillation","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Li"},{"article-title":"SimVLM: Simple visual language model pretraining with weak supervision","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Wang","key":"ref67"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00180"},{"article-title":"Layer normalization","year":"2016","author":"Ba","key":"ref69"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref71","first-page":"4055","article-title":"Image transformer","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Parmar"},{"article-title":"Universal transformers","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Dehghani","key":"ref72"},{"article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","year":"2019","author":"Raffel","key":"ref73"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1016\/j.aiopen.2023.08.012"},{"key":"ref75","first-page":"2790","article-title":"Parameter-efficient transfer learning for NLP","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Houlsby"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.eacl-main.39"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00075"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2009.03.008"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00166"},{"key":"ref80","first-page":"19652","article-title":"Referring transformer: A one-step approach to multi-task visual grounding","volume-title":"Proc. Adv. Neural Inf. Proc. Syst.","author":"Li"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/ICME52920.2022.9859880"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6833"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.322"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-20870-7_28"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2018\/155"},{"article-title":"Semantic image segmentation with deep convolutional nets and fully connected CRFs","year":"2014","author":"Chen","key":"ref87"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.findings-emnlp.372"},{"key":"ref89","first-page":"13937","article-title":"DynamicViT: Efficient vision transformers with dynamic token sparsification","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Rao"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i2.16188"},{"key":"ref91","first-page":"12991","article-title":"LST: Ladder side-tuning for parameter and memory efficient transfer learning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"35","author":"Sung","year":"2022"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52729.2023.01838"},{"key":"ref93","first-page":"8583","article-title":"Scaling vision with sparse mixture of experts","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Riquelme"},{"article-title":"Pix2Seq: A language modeling framework for object detection","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Chen","key":"ref94"}],"container-title":["IEEE Transactions on Pattern Analysis and Machine Intelligence"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/34\/10269680\/10187690.pdf?arnumber=10187690","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,1,12]],"date-time":"2024-01-12T01:55:19Z","timestamp":1705024519000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10187690\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,11,1]]},"references-count":94,"journal-issue":{"issue":"11"},"URL":"https:\/\/doi.org\/10.1109\/tpami.2023.3296823","relation":{},"ISSN":["0162-8828","2160-9292","1939-3539"],"issn-type":[{"type":"print","value":"0162-8828"},{"type":"electronic","value":"2160-9292"},{"type":"electronic","value":"1939-3539"}],"subject":[],"published":{"date-parts":[[2023,11,1]]}}}