{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,11,19]],"date-time":"2024-11-19T18:44:58Z","timestamp":1732041898836},"reference-count":84,"publisher":"IEEE","license":[{"start":{"date-parts":[[2022,6,1]],"date-time":"2022-06-01T00:00:00Z","timestamp":1654041600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2022,6,1]],"date-time":"2022-06-01T00:00:00Z","timestamp":1654041600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022,6]]},"DOI":"10.1109\/cvpr52688.2022.00517","type":"proceedings-article","created":{"date-parts":[[2022,9,27]],"date-time":"2022-09-27T19:56:41Z","timestamp":1664308601000},"source":"Crossref","is-referenced-by-count":66,"title":["Winoground: Probing Vision and Language Models for Visio-Linguistic Compositionality"],"prefix":"10.1109","author":[{"given":"Tristan","family":"Thrush","sequence":"first","affiliation":[{"name":"Hugging Face"}]},{"given":"Ryan","family":"Jiang","sequence":"additional","affiliation":[{"name":"University of Waterloo"}]},{"given":"Max","family":"Bartolo","sequence":"additional","affiliation":[{"name":"University College London"}]},{"given":"Amanpreet","family":"Singh","sequence":"additional","affiliation":[{"name":"Hugging Face"}]},{"given":"Adina","family":"Williams","sequence":"additional","affiliation":[{"name":"Facebook AI Research"}]},{"given":"Douwe","family":"Kiela","sequence":"additional","affiliation":[{"name":"Hugging Face"}]},{"given":"Candace","family":"Ross","sequence":"additional","affiliation":[{"name":"Facebook AI Research"}]}],"member":"263","reference":[{"key":"ref73","article-title":"BLiMP: The benchmark of linguistic minimal pairs for English","author":"warstadt","year":"2020","journal-title":"TACL"},{"key":"ref72","article-title":"Investigating BERT's knowledge of language: Five analysis methods with NPIs","author":"warstadt","year":"2019","journal-title":"EMNLP-IJCNLP"},{"key":"ref71","article-title":"Curi: A benchmark for productive concept learning under uncertainty","author":"vedantam","year":"2021","journal-title":"ICML"},{"key":"ref70","article-title":"Attention is all you need","author":"vaswani","year":"2017","journal-title":"NeurIPS"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1016\/0010-0285(72)90002-3"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-demos.6"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00019"},{"key":"ref39","article-title":"Visual genome: Connecting language and vision using crowdsourced dense image annotations","author":"krishna","year":"2016","journal-title":"ArXiv Preprint"},{"key":"ref75","article-title":"A broad-coverage challenge corpus for sentence understanding through inference","author":"williams","year":"2017","journal-title":"ArXiv Preprint"},{"key":"ref38","article-title":"A review of winograd schema challenge datasets and approaches","author":"kocijan","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref78","article-title":"Visual entailment task for visually-grounded language learning","author":"xie","year":"2018","journal-title":"ArXiv Preprint"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00166"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.215"},{"key":"ref32","author":"iyer","year":"2017","journal-title":"First Quora dataset release Question pairs"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00147"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.158"},{"key":"ref37","article-title":"Hatemoji: A test suite and adversarially-generated dataset for benchmarking and detecting emoji-based hate","author":"kirk","year":"2021","journal-title":"ArXiv Preprint"},{"key":"ref36","article-title":"Vilt: Vision-and-language transformer without convolution or region supervision","author":"kim","year":"2021","journal-title":"ICML"},{"key":"ref35","article-title":"The hateful memes challenge: Detecting hate speech in multimodal memes","author":"kiela","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref34","article-title":"Verb argument structure alternations in word and sentence embeddings","author":"kann","year":"2019","journal-title":"SCiL"},{"key":"ref60","article-title":"Very deep convolutional networks for largescale image recognition","author":"simonyan","year":"2015","journal-title":"CVPR"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00851"},{"key":"ref61","article-title":"Are we pretraining it right? digging deeper into visio-linguistic pretraining","author":"singh","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.230"},{"key":"ref28","article-title":"Detection of cyberbullying incidents on the instagram social network","author":"hosseinmardi","year":"2015","journal-title":"ArXiv Preprint"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.569"},{"key":"ref27","article-title":"Benchmarking neural network robustness to common corruptions and perturbations","author":"hendrycks","year":"2019","journal-title":"ICLR"},{"key":"ref65","article-title":"Recursive deep models for semantic compositionality over a sentiment treebank","author":"socher","year":"2013","journal-title":"EMNLP"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P17-2034"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2019.00237"},{"key":"ref67","article-title":"Findings of the shared task on troll meme classification in Tamil","author":"suryawanshi","year":"0","journal-title":"Proceedings of the First Workshop on Speech and Language Technologies for Dravidian Languages"},{"key":"ref68","article-title":"Lxmert: Learning cross-modality encoder representations from transformers","author":"tan","year":"2020","journal-title":"EMNLP-IJCNLP"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.279"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.blackboxnlp-1.25"},{"key":"ref1","article-title":"Terence Parsons, and Roger Schwarzschild","author":"altshuler","year":"2019","journal-title":"Semantics a coursebook"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-demos.10"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/WACV45572.2020.9093414"},{"key":"ref21","year":"0"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N18-1108"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.670"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.findings-acl.318"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref50","article-title":"Learning transferable visual models from natural language supervision","author":"radford","year":"2021","journal-title":"ICML"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D16-1264"},{"key":"ref59","article-title":"Textcaps: a dataset for image captioning with reading comprehension","author":"sidorov","year":"2020","journal-title":"ECCV"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P17-1024"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1238"},{"key":"ref56","article-title":"Neural machine translation of rare words with subword units","author":"sennrich","year":"2015","journal-title":"ArXiv Preprint"},{"key":"ref55","article-title":"How computers see gender: An evaluation of gender classification in commercial facial analysis services","author":"scheuerman","year":"2019","journal-title":"Human-Computer Interaction"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i05.6399"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N18-2002"},{"key":"ref52","article-title":"Faster r-cnn: Towards real-time object detection with region proposal networks","author":"ren","year":"2015","journal-title":"NeurIPS"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1016\/j.patrec.2011.12.004"},{"key":"ref11","article-title":"Empirical evaluation of gated recurrent neural networks on sequence modeling","author":"chung","year":"2014","journal-title":"NeurIPS"},{"key":"ref40","article-title":"The winograd schema challenge","author":"levesque","year":"0","journal-title":"Conference of Principles of Knowledge Representation and Reasoning"},{"key":"ref12","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","author":"devlin","year":"2019","journal-title":"NAACL Human Language Technologies"},{"key":"ref13","article-title":"Understanding image and text simultaneously: a dual vision-language machine comprehension task","author":"ding","year":"2016","journal-title":"arXiv preprint"},{"key":"ref14","article-title":"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale","author":"dosovitskiy","year":"2021","journal-title":"ICLR"},{"key":"ref15","article-title":"An empirical study of training end-to-end vision-and-language transformers","author":"dou","year":"2021","journal-title":"ArXiv Preprint"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N18-2003"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.819"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00553"},{"key":"ref17","article-title":"VSE++: Improving visual-semantic embeddings with hard negatives","author":"faghri","year":"2018","journal-title":"BMVC"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.540"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.775"},{"key":"ref83","article-title":"Content-driven detection of cyberbullying on the instagram social network","author":"zhong","year":"2016","journal-title":"IJCAI"},{"key":"ref19","article-title":"Large-scale adversarial training for vision-and-language representation learning","author":"gan","year":"2020","journal-title":"NeurIPS"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00688"},{"key":"ref4","article-title":"Multimodal datasets: misogyny, pornography, and malignant stereotypes","author":"birhane","year":"2021","journal-title":"ArXiv Preprint"},{"key":"ref3","article-title":"Establishing a human baseline for the winograd schema challenge","author":"bender","year":"2015","journal-title":"Artificial Intelligence and Cognitive Science"},{"key":"ref6","article-title":"Behind the scene: Revealing the secrets of pre-trained vision-and-language models","author":"cao","year":"2020","journal-title":"ECCV"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.naacl-main.9"},{"key":"ref8","article-title":"Being negative but constructively: Lessons learnt from creating better visual question answering datasets","author":"chao","year":"2017","journal-title":"arXiv preprint"},{"key":"ref7","article-title":"End-to-end object detection with transformers","author":"carion","year":"2020","journal-title":"ECCV"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.findings-emnlp.275"},{"key":"ref9","article-title":"Uniter: Universal image-text representation learning","author":"chen","year":"2020","journal-title":"ECCV"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.818"},{"key":"ref45","article-title":"Assessing the ability of lstms to learn syntax-sensitive dependencies","author":"linzen","year":"2015","journal-title":"TACL"},{"key":"ref48","article-title":"Im2text: Describing images using 1 million captioned photographs","author":"ordonez","year":"2011","journal-title":"NIPS"},{"key":"ref47","article-title":"ViL-BERT: Pretraining Task-Agnostic Visiolinguistic Representations for Vision-and-Language Tasks","author":"lu","year":"2019","journal-title":"NeurIPS"},{"key":"ref42","article-title":"A closer look at the robustness of vision-and-language pre-trained models","author":"li","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00475"},{"key":"ref44","article-title":"Microsoft coco: Common objects in context","author":"lin","year":"2014","journal-title":"ECCV"},{"key":"ref43","article-title":"VisualBERT: A Simple and Performant Baseline for Vision and Language","author":"li","year":"2019","journal-title":"arXiv preprint"}],"event":{"name":"2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","location":"New Orleans, LA, USA","start":{"date-parts":[[2022,6,18]]},"end":{"date-parts":[[2022,6,24]]}},"container-title":["2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9878378\/9878366\/09878945.pdf?arnumber=9878945","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,10,14]],"date-time":"2022-10-14T21:02:24Z","timestamp":1665781344000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9878945\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,6]]},"references-count":84,"URL":"https:\/\/doi.org\/10.1109\/cvpr52688.2022.00517","relation":{},"subject":[],"published":{"date-parts":[[2022,6]]}}}