{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,10,29]],"date-time":"2024-10-29T14:37:26Z","timestamp":1730212646696,"version":"3.28.0"},"reference-count":95,"publisher":"IEEE","license":[{"start":{"date-parts":[[2023,6,1]],"date-time":"2023-06-01T00:00:00Z","timestamp":1685577600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,6,1]],"date-time":"2023-06-01T00:00:00Z","timestamp":1685577600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023,6]]},"DOI":"10.1109\/cvpr52729.2023.01409","type":"proceedings-article","created":{"date-parts":[[2023,8,22]],"date-time":"2023-08-22T17:30:52Z","timestamp":1692725452000},"page":"14663-14674","source":"Crossref","is-referenced-by-count":3,"title":["Egocentric Auditory Attention Localization in Conversations"],"prefix":"10.1109","author":[{"given":"Fiona","family":"Ryan","sequence":"first","affiliation":[{"name":"Georgia Institute of Technology"}]},{"given":"Hao","family":"Jiang","sequence":"additional","affiliation":[{"name":"Meta Reality Labs Research"}]},{"given":"Abhinav","family":"Shukla","sequence":"additional","affiliation":[{"name":"Meta Reality Labs Research"}]},{"given":"James M.","family":"Rehg","sequence":"additional","affiliation":[{"name":"Georgia Institute of Technology"}]},{"given":"Vamsi Krishna","family":"Ithapu","sequence":"additional","affiliation":[{"name":"Meta Reality Labs Research"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58523-5_13"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR.2016.7900087"},{"key":"ref3","first-page":"24206","article-title":"Vatt: Transformers for multimodal self-supervised learning from raw video, audio and text","volume":"34","author":"Akbari","year":"2021","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1016\/j.neuroimage.2015.09.048"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/DICTA47822.2019.8945893"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1145\/2070481.2070527"},{"key":"ref7","first-page":"25","article-title":"Self-supervised multimodal versatile networks","volume":"33","author":"Alayrac","year":"2020","journal-title":"Advances in Neu-ral Information Processing Systems"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01248"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00033"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19836-6_8"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.3389\/fnins.2019.00153"},{"key":"ref12","first-page":"9758","article-title":"Self-supervised learning by cross-modal audio-video clustering","volume":"33","author":"Alwassel","year":"2020","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01246-5_27"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00676"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2953020"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2018.2850284"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1016\/j.neuroimage.2019.116283"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/TNSRE.2016.2571900"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1088\/1741-2560\/13\/6\/066004"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1080\/17470215608416814"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1145\/2818346.2820780"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01659"},{"key":"ref24","article-title":"Triantafyl-los Afouras, and Andrew Zisserman. Spot the conver-sation: speaker diarisation in the wild","author":"Chung","year":"2020","journal-title":"arXiv preprint"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/ICME.2000.871073"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746991"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1073\/pnas.1205381109"},{"key":"ref28","article-title":"Easycom: An augmented real-ity dataset to support algorithms for easy communication in noisy environments","author":"Donley","year":"2021","journal-title":"arXiv preprint"},{"key":"ref29","article-title":"An image is worth 16x16 words: Trans-formers for image recognition at scale","author":"Dosovitskiy","year":"2020","journal-title":"arXiv preprint"},{"key":"ref30","first-page":"1206","article-title":"Using au-ditory saliency to understand complex auditory scenes","volume-title":"2007 15th European Signal Processing Conference","author":"Duangudom"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2008.04.018"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2012.6247805"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1016\/j.neuroimage.2017.04.026"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2006.881678"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/MLSP.2014.6958874"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1038\/s41598-021-94876-0"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01842"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1016\/j.bspc.2017.08.008"},{"key":"ref39","first-page":"117","article-title":"Towards speaker detection using lips movements for humanmachine multi-party dialogue","volume-title":"The XXVth Swedish Phonetics Conference (FONETIK)","author":"Haider"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1088\/1741-2560\/11\/4\/046015"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr.2019.00947"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3137988"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2020.3007841"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01225-0_46"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054137"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52688.2022.01029"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2007-44"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/CISS.2012.6310945"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.3389\/fnhum.2014.00327"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1098\/rstb.2016.0101"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1016\/j.cub.2005.09.040"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1016\/j.patrec.2013.11.010"},{"key":"ref53","article-title":"Adam: A method for stochastic optimization","author":"Kingma","year":"2014","journal-title":"arXiv preprint"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/iccv48922.2021.00123"},{"key":"ref55","article-title":"Coopera-tive learning of audio and video models from self-supervised synchronization","volume":"31","author":"Korbar","year":"2018","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1121\/10.0006750"},{"volume-title":"The ami meeting corpus","year":"2005","author":"Kraaij","key":"ref57"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.3389\/fphys.2021.700655"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-023-01879-7"},{"key":"ref60","article-title":"Visualbert: A simple and perfor-mant baseline for vision and language","author":"Li","year":"2019","journal-title":"arXiv preprint"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.399"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2021.3051319"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i2.20073"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1177\/23312165221097789"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1038\/nature11020"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19833-5_22"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1088\/1741-2560\/12\/4\/046007"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19836-6_13"},{"key":"ref69","first-page":"14200","article-title":"Attention bottlenecks for multimodal fusion","volume":"34","author":"Nagrani","year":"2021","journal-title":"Advances in Neural Information Pro-cessing Systems"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01975"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1093\/cercor\/bht355"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01231-1_39"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46448-0_48"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1088\/1741-2552\/aa7ab4"},{"key":"ref75","article-title":"Multi-modal self-supervision from generalized data transformations","author":"Patrick","year":"2020","journal-title":"arXiv preprint"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2020.2996412"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58565-5_18"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053900"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2005.251"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00458"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747867"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475587"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1109\/WACV.2019.00035"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1145\/3462244.3479954"},{"key":"ref85","article-title":"The loud bird doesnt (always) get the worm: Why computational salience also needs brightness and tempo","author":"Tordini","year":"2015","journal-title":"Georgia Institute of Technology"},{"key":"ref86","article-title":"Toward an improved model of auditory saliency","author":"Tordini","year":"2013","journal-title":"Georgia Institute of Technology"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00675"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00114"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1109\/TBME.2016.2587382"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref91","article-title":"Cstr vctk corpus: English multi-speaker corpus for cstr voice cloning toolkit","author":"Veaux","year":"2017","journal-title":"University of Edinburgh. The Centre for Speech Technology Research (CSTR)"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413399"},{"key":"ref93","article-title":"Look&listen: Multi-modal correlation learning for active speaker detection and speech enhancement","author":"Xiong","year":"2022","journal-title":"ArXiv, abs\/2203.02216"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2008.2007344"},{"key":"ref95","first-page":"1","article-title":"Multi-task learning for audio-visual ac-tive speaker detection","author":"Zhang","year":"2019","journal-title":"The ActivityNet Large-Scale Activity Recognition Challenge"}],"event":{"name":"2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","start":{"date-parts":[[2023,6,17]]},"location":"Vancouver, BC, Canada","end":{"date-parts":[[2023,6,24]]}},"container-title":["2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/10203037\/10203050\/10203787.pdf?arnumber=10203787","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,3,1]],"date-time":"2024-03-01T17:19:37Z","timestamp":1709313577000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10203787\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,6]]},"references-count":95,"URL":"https:\/\/doi.org\/10.1109\/cvpr52729.2023.01409","relation":{},"subject":[],"published":{"date-parts":[[2023,6]]}}}