{"id":"https://openalex.org/W4319862658","doi":"https://doi.org/10.1109/slt54892.2023.10022897","title":"Combining Contrastive and Non-Contrastive Losses for Fine-Tuning Pretrained Models in Speech Analysis","display_name":"Combining Contrastive and Non-Contrastive Losses for Fine-Tuning Pretrained Models in Speech Analysis","publication_year":2023,"publication_date":"2023-01-09","ids":{"openalex":"https://openalex.org/W4319862658","doi":"https://doi.org/10.1109/slt54892.2023.10022897"},"language":"en","primary_location":{"is_oa":false,"landing_page_url":"https://doi.org/10.1109/slt54892.2023.10022897","pdf_url":null,"source":{"id":"https://openalex.org/S4363605953","display_name":"2022 IEEE Spoken Language Technology Workshop (SLT)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false},"type":"article","type_crossref":"proceedings-article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"http://arxiv.org/pdf/2211.01964","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5078880407","display_name":"Florian Lux","orcid":"https://orcid.org/0000-0003-4325-5129"},"institutions":[{"id":"https://openalex.org/I100066346","display_name":"University of Stuttgart","ror":"https://ror.org/04vnq7t77","country_code":"DE","type":"education","lineage":["https://openalex.org/I100066346"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Florian Lux","raw_affiliation_strings":["University of Stuttgart, Institute for Natural Language Processing, Germany"],"affiliations":[{"raw_affiliation_string":"University of Stuttgart, Institute for Natural Language Processing, Germany","institution_ids":["https://openalex.org/I100066346"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100757473","display_name":"Ching\u2010Yi Chen","orcid":"https://orcid.org/0000-0002-5776-084X"},"institutions":[{"id":"https://openalex.org/I100066346","display_name":"University of Stuttgart","ror":"https://ror.org/04vnq7t77","country_code":"DE","type":"education","lineage":["https://openalex.org/I100066346"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Ching-Yi Chen","raw_affiliation_strings":["University of Stuttgart, Institute for Natural Language Processing, Germany"],"affiliations":[{"raw_affiliation_string":"University of Stuttgart, Institute for Natural Language Processing, Germany","institution_ids":["https://openalex.org/I100066346"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5020700841","display_name":"Ngoc Thang Vu","orcid":"https://orcid.org/0000-0001-7893-9147"},"institutions":[{"id":"https://openalex.org/I100066346","display_name":"University of Stuttgart","ror":"https://ror.org/04vnq7t77","country_code":"DE","type":"education","lineage":["https://openalex.org/I100066346"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Ngoc Thang Vu","raw_affiliation_strings":["University of Stuttgart, Institute for Natural Language Processing, Germany"],"affiliations":[{"raw_affiliation_string":"University of Stuttgart, Institute for Natural Language Processing, Germany","institution_ids":["https://openalex.org/I100066346"]}]}],"institution_assertions":[],"countries_distinct_count":1,"institutions_distinct_count":1,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.412,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.558059,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":67,"max":78},"biblio":{"volume":null,"issue":null,"first_page":"876","last_page":"883"},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9999,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9999,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9982,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9978,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/paralanguage","display_name":"Paralanguage","score":0.7927856},{"id":"https://openalex.org/keywords/discriminative-model","display_name":"Discriminative model","score":0.6949498},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.5119089}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8700247},{"id":"https://openalex.org/C133378560","wikidata":"https://www.wikidata.org/wiki/Q1753225","display_name":"Paralanguage","level":2,"score":0.7927856},{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.6949498},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.6679069},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.55856377},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5567037},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.5119089},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.48277178},{"id":"https://openalex.org/C174348530","wikidata":"https://www.wikidata.org/wiki/Q188635","display_name":"Bridging (networking)","level":2,"score":0.4614237},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.4395258},{"id":"https://openalex.org/C2777212361","wikidata":"https://www.wikidata.org/wiki/Q5127848","display_name":"Class (philosophy)","level":2,"score":0.43641937},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.0},{"id":"https://openalex.org/C187736073","wikidata":"https://www.wikidata.org/wiki/Q2920921","display_name":"Management","level":1,"score":0.0},{"id":"https://openalex.org/C46312422","wikidata":"https://www.wikidata.org/wiki/Q11024","display_name":"Communication","level":1,"score":0.0},{"id":"https://openalex.org/C144024400","wikidata":"https://www.wikidata.org/wiki/Q21201","display_name":"Sociology","level":0,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.0},{"id":"https://openalex.org/C13280743","wikidata":"https://www.wikidata.org/wiki/Q131089","display_name":"Geodesy","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"is_oa":false,"landing_page_url":"https://doi.org/10.1109/slt54892.2023.10022897","pdf_url":null,"source":{"id":"https://openalex.org/S4363605953","display_name":"2022 IEEE Spoken Language Technology Workshop (SLT)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false},{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2211.01964","pdf_url":"http://arxiv.org/pdf/2211.01964","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false}],"best_oa_location":{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2211.01964","pdf_url":"http://arxiv.org/pdf/2211.01964","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[{"score":0.75,"id":"https://metadata.un.org/sdg/10","display_name":"Reduced inequalities"}],"grants":[],"datasets":[],"versions":[],"referenced_works_count":29,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W1962925405","https://openalex.org/W1983645263","https://openalex.org/W2146334809","https://openalex.org/W2171590421","https://openalex.org/W2187089797","https://openalex.org/W2530305026","https://openalex.org/W2726515241","https://openalex.org/W2754547322","https://openalex.org/W2807627734","https://openalex.org/W2899436735","https://openalex.org/W2962788625","https://openalex.org/W2963775347","https://openalex.org/W2964105864","https://openalex.org/W2972362771","https://openalex.org/W2982223350","https://openalex.org/W3015265920","https://openalex.org/W3024869864","https://openalex.org/W3041561163","https://openalex.org/W3089942512","https://openalex.org/W3112034174","https://openalex.org/W3113594615","https://openalex.org/W3181310845","https://openalex.org/W3197580070","https://openalex.org/W3197642003","https://openalex.org/W3198275944","https://openalex.org/W3209059054","https://openalex.org/W4210694145","https://openalex.org/W4297808394"],"related_works":["https://openalex.org/W4250647969","https://openalex.org/W3200958703","https://openalex.org/W3166813893","https://openalex.org/W3108667266","https://openalex.org/W2910013580","https://openalex.org/W2778981579","https://openalex.org/W2391900574","https://openalex.org/W2376619307","https://openalex.org/W2064370490","https://openalex.org/W1990078780"],"abstract_inverted_index":{"Embedding":[0],"paralinguistic":[1],"properties":[2,55],"is":[3,29,47],"a":[4,11,32,51,71,97,109,139],"challenging":[5],"task":[6],"as":[7,21],"there":[8],"are":[9,131],"only":[10],"few":[12],"hours":[13],"of":[14,41,111],"training":[15],"data":[16],"available":[17],"for":[18,119],"domains":[19],"such":[20],"emotional":[22],"speech.":[23,43],"One":[24],"solution":[25],"to":[26,30,50,75,88,96,102,116],"this":[27,67],"problem":[28],"pretrain":[31],"general":[33],"self-supervised":[34],"speech":[35],"representation":[36],"model":[37,46],"on":[38,134,141],"large":[39],"amounts":[40],"unlabeled":[42],"This":[44],"pretrained":[45],"then":[48,83],"finetuned":[49,132],"specific":[52],"task.":[53,99],"Paralinguistic":[54],"however":[56],"have":[57],"notoriously":[58],"high":[59],"class":[60,105,120],"variance,":[61],"making":[62],"the":[63,80,90,93,104],"finetuning":[64],"ineffective.":[65],"In":[66,100],"work,":[68],"we":[69,78,84,107],"propose":[70],"two":[72],"step":[73],"approach":[74,126],"this.":[76],"First":[77],"improve":[79,103],"embedding":[81,94],"space,":[82],"train":[85],"an":[86],"adapter":[87],"bridge":[89],"gap":[91],"from":[92],"space":[95],"classification":[98],"order":[101],"invariance":[106],"use":[108],"combination":[110],"contrastive":[112],"and":[113,137],"non-contrastive":[114],"losses":[115],"explicitly":[117],"optimize":[118],"invariant,":[121],"yet":[122],"discriminative":[123],"features.":[124],"Our":[125],"consistently":[127],"outperforms":[128],"baselines":[129],"that":[130],"end-to-end":[133],"multiple":[135],"tasks":[136],"surpasses":[138],"benchmark":[140],"state-of-the-art":[142],"emotion":[143],"classification.":[144]},"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4319862658","counts_by_year":[{"year":2023,"cited_by_count":1}],"updated_date":"2025-01-01T20:37:18.045183","created_date":"2023-02-11"}