{"id":"https://openalex.org/W4386644469","doi":"https://doi.org/10.48550/arxiv.2309.04628","title":"Leveraging Pretrained Image-text Models for Improving Audio-Visual Learning","display_name":"Leveraging Pretrained Image-text Models for Improving Audio-Visual Learning","publication_year":2023,"publication_date":"2023-01-01","ids":{"openalex":"https://openalex.org/W4386644469","doi":"https://doi.org/10.48550/arxiv.2309.04628"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2309.04628","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false},"type":"preprint","type_crossref":"posted-content","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/abs/2309.04628","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5045247662","display_name":"Saurabhchand Bhati","orcid":"https://orcid.org/0000-0001-6477-3895"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bhati, Saurabhchand","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5057010207","display_name":"Jes\u00fas Villalba","orcid":"https://orcid.org/0000-0001-9459-8426"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Villalba, Jes\u00fas","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5069488212","display_name":"Laureano Moro-Vel\u00e1zquez","orcid":"https://orcid.org/0000-0002-3033-7005"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Moro-Velazquez, Laureano","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5040324682","display_name":"Thomas Thebaud","orcid":"https://orcid.org/0000-0001-8953-7872"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Thebaud, Thomas","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5050632169","display_name":"Najim Dehak","orcid":"https://orcid.org/0000-0002-4489-5753"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dehak, Najim","raw_affiliation_strings":[],"affiliations":[]}],"institution_assertions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.0,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":0,"max":65},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9994,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9994,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9828,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9823,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.80662334},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.78519773},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6576067},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.541652},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5283248},{"id":"https://openalex.org/C2777601683","wikidata":"https://www.wikidata.org/wiki/Q6499736","display_name":"Vocabulary","level":2,"score":0.52643025},{"id":"https://openalex.org/C90805587","wikidata":"https://www.wikidata.org/wiki/Q10944557","display_name":"Word (group theory)","level":2,"score":0.49062648},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.43059134},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.11394349},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2309.04628","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false},{"is_oa":false,"landing_page_url":"https://api.datacite.org/dois/10.48550/arxiv.2309.04628","pdf_url":null,"source":{"id":"https://openalex.org/S4393179698","display_name":"DataCite API","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I4210145204","host_organization_name":"DataCite","host_organization_lineage":["https://openalex.org/I4210145204"],"host_organization_lineage_names":["DataCite"],"type":"metadata"},"license":null,"license_id":null,"version":null}],"best_oa_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2309.04628","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","score":0.75,"display_name":"Quality education"}],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4390516098","https://openalex.org/W4307058054","https://openalex.org/W3022596247","https://openalex.org/W2944691285","https://openalex.org/W2601444686","https://openalex.org/W2384362569","https://openalex.org/W2349784553","https://openalex.org/W2181948922","https://openalex.org/W2142795561","https://openalex.org/W1997182898"],"abstract_inverted_index":{"Visually":[0],"grounded":[1,21,38],"speech":[2,90,163],"systems":[3,218],"learn":[4],"from":[5,24],"paired":[6],"images":[7,25],"and":[8,26,61,67,112,151,186,191],"their":[9,27],"spoken":[10],"captions.":[11],"Recently,":[12],"there":[13],"have":[14],"been":[15],"attempts":[16],"to":[17,34,56,92,131,145,157,183,221],"utilize":[18,48,62],"the":[19,42,49,64,100,117,126,132,137,162,167,176,222],"visually":[20,37],"models":[22,46,174],"trained":[23,193],"corresponding":[28],"text":[29,68,103,134,177,187],"captions,":[30],"such":[31],"as":[32,129,175],"CLIP,":[33],"improve":[35],"speech-based":[36],"models'":[39],"performance.":[40,80],"However,":[41],"majority":[43],"of":[44,95,107,120,169,208],"these":[45,108],"only":[47,206],"pretrained":[50,65,101,172],"image":[51,66,185],"encoder.":[52],"Cascaded":[53],"SpeechCLIP":[54,84,123],"attempted":[55],"generate":[57,93],"localized":[58],"word-level":[59],"information":[60,160],"both":[63],"encoders.":[69,178],"Despite":[70],"using":[71],"both,":[72],"they":[73],"noticed":[74],"a":[75,87],"substantial":[76],"drop":[77],"in":[78,202],"retrieval":[79],"We":[81,98],"proposed":[82],"Segmental":[83,122],"which":[85],"used":[86,99],"hierarchical":[88],"segmental":[89],"encoder":[91,104,135],"sequences":[94],"word-like":[96,109],"units.":[97],"CLIP":[102,133,146],"on":[105],"top":[106],"unit":[110],"representations":[111],"showed":[113],"significant":[114],"improvements":[115],"over":[116],"cascaded":[118],"variant":[119],"SpeechCLIP.":[121],"directly":[124],"learns":[125],"word":[127],"embeddings":[128,148],"input":[130],"bypassing":[136],"vocabulary":[138,147],"embeddings.":[139],"Here,":[140],"we":[141,165,198],"explore":[142,166],"mapping":[143],"audio":[144,211],"via":[149],"regularization":[150],"quantization.":[152],"As":[153],"our":[154,200],"objective":[155],"is":[156],"distill":[158],"semantic":[159],"into":[161],"encoders,":[164],"usage":[168],"large":[170],"unimodal":[171],"language":[173],"Our":[179],"method":[180],"enables":[181],"us":[182],"bridge":[184],"encoders":[188],"e.g.":[189],"DINO":[190],"RoBERTa":[192],"with":[194],"uni-modal":[195],"data.":[196],"Finally,":[197],"extend":[199],"framework":[201],"audio-only":[203,217],"settings":[204],"where":[205],"pairs":[207],"semantically":[209],"related":[210],"are":[212],"available.":[213],"Experiments":[214],"show":[215],"that":[216],"perform":[219],"close":[220],"audio-visual":[223],"system.":[224]},"abstract_inverted_index_v3":null,"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4386644469","counts_by_year":[],"updated_date":"2025-04-11T07:07:30.777921","created_date":"2023-09-13"}