{"id":"https://openalex.org/W4386494534","doi":"https://doi.org/10.1109/lsp.2023.3313090","title":"Cooperative Game Modeling With Weighted Token-Level Alignment for Audio-Text Retrieval","display_name":"Cooperative Game Modeling With Weighted Token-Level Alignment for Audio-Text Retrieval","publication_year":2023,"publication_date":"2023-01-01","ids":{"openalex":"https://openalex.org/W4386494534","doi":"https://doi.org/10.1109/lsp.2023.3313090"},"language":"en","primary_location":{"is_oa":false,"landing_page_url":"https://doi.org/10.1109/lsp.2023.3313090","pdf_url":null,"source":{"id":"https://openalex.org/S120629676","display_name":"IEEE Signal Processing Letters","issn_l":"1070-9908","issn":["1070-9908","1558-2361"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false},"type":"article","type_crossref":"journal-article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5063679203","display_name":"Yifei Xin","orcid":"https://orcid.org/0000-0001-7792-6352"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yifei Xin","raw_affiliation_strings":["School of Electric and Computer Science, Peking University Shenzhen Graduate School, Shenzhen, China"],"affiliations":[{"raw_affiliation_string":"School of Electric and Computer Science, Peking University Shenzhen Graduate School, Shenzhen, China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100417799","display_name":"Baojun Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I2250955327","display_name":"Huawei Technologies (China)","ror":"https://ror.org/00cmhce21","country_code":"CN","type":"company","lineage":["https://openalex.org/I2250955327"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Baojun Wang","raw_affiliation_strings":["Noah's Ark Lab, Huawei, Shenzhen, China"],"affiliations":[{"raw_affiliation_string":"Noah's Ark Lab, Huawei, Shenzhen, China","institution_ids":["https://openalex.org/I2250955327"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5046228314","display_name":"Lifeng Shang","orcid":null},"institutions":[{"id":"https://openalex.org/I2250955327","display_name":"Huawei Technologies (China)","ror":"https://ror.org/00cmhce21","country_code":"CN","type":"company","lineage":["https://openalex.org/I2250955327"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Lifeng Shang","raw_affiliation_strings":["Noah's Ark Lab, Huawei, Shenzhen, China"],"affiliations":[{"raw_affiliation_string":"Noah's Ark Lab, Huawei, Shenzhen, China","institution_ids":["https://openalex.org/I2250955327"]}]}],"institution_assertions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":1.884,"has_fulltext":false,"cited_by_count":4,"citation_normalized_percentile":{"value":0.601087,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":87,"max":90},"biblio":{"volume":"30","issue":null,"first_page":"1317","last_page":"1321"},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9971,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9941,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/margin","display_name":"Margin (machine learning)","score":0.4987042}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8243839},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.73751515},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.55688864},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5303372},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5075642},{"id":"https://openalex.org/C774472","wikidata":"https://www.wikidata.org/wiki/Q6760393","display_name":"Margin (machine learning)","level":2,"score":0.4987042},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.21434769},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"is_oa":false,"landing_page_url":"https://doi.org/10.1109/lsp.2023.3313090","pdf_url":null,"source":{"id":"https://openalex.org/S120629676","display_name":"IEEE Signal Processing Letters","issn_l":"1070-9908","issn":["1070-9908","1558-2361"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.74,"display_name":"Quality education","id":"https://metadata.un.org/sdg/4"}],"grants":[],"datasets":[],"versions":[],"referenced_works_count":25,"referenced_works":["https://openalex.org/W1984432346","https://openalex.org/W2009662043","https://openalex.org/W2091836314","https://openalex.org/W2213630128","https://openalex.org/W2268194897","https://openalex.org/W2890358045","https://openalex.org/W2896457183","https://openalex.org/W3005680577","https://openalex.org/W3008569663","https://openalex.org/W3015591594","https://openalex.org/W3021397474","https://openalex.org/W3094550259","https://openalex.org/W3165431782","https://openalex.org/W3204267711","https://openalex.org/W4210913346","https://openalex.org/W4221146248","https://openalex.org/W4221157007","https://openalex.org/W4224933373","https://openalex.org/W4312530435","https://openalex.org/W4372260330","https://openalex.org/W4372260340","https://openalex.org/W4385764570","https://openalex.org/W4385822467","https://openalex.org/W4385823504","https://openalex.org/W4386076265"],"related_works":["https://openalex.org/W4388335561","https://openalex.org/W4385572700","https://openalex.org/W4385009901","https://openalex.org/W4307309205","https://openalex.org/W4288261899","https://openalex.org/W3204019825","https://openalex.org/W3125011624","https://openalex.org/W2997152889","https://openalex.org/W2970530566","https://openalex.org/W2967478618"],"abstract_inverted_index":{"Previous":[0],"audio-text":[1,56,74,119],"retrieval":[2,166],"(ATR)":[3],"methods":[4],"primarily":[5],"concentrate":[6],"on":[7,158],"constructing":[8],"contrastive":[9,120],"pairs":[10],"between":[11,55,99,144],"entire":[12],"audio":[13,80,100],"clips":[14],"and":[15,82,87,102,127,139],"full":[16],"caption":[17],"sentences,":[18],"while":[19],"neglecting":[20],"fine-grained":[21,40,52],"cross-modal":[22],"relationships.":[23],"In":[24],"this":[25],"paper,":[26],"we":[27,58,78,129],"first":[28],"introduce":[29],"a":[30,64,89,131,172],"weighted":[31],"token-level":[32],"alignment":[33],"(WTA)":[34],"module":[35,134],"for":[36],"ATR":[37,62,156],"to":[38,45,60,68,95,115,123,135],"learn":[39],"semantic":[41,75],"interactions.":[42,76],"Besides,":[43],"due":[44],"the":[46,51,71,117,137,141,145,155,165],"unavailability":[47],"of":[48],"manually":[49],"labeling":[50],"sequential":[53],"correspondence":[54,98],"pairs,":[57],"attempt":[59],"model":[61],"as":[63,85,110],"cooperative":[65],"game":[66,90],"process":[67],"flexibly":[69],"handle":[70],"uncertainty":[72],"during":[73],"Specifically,":[77],"treat":[79],"frames":[81,101],"text":[83,103],"words":[84],"players":[86],"present":[88],"theoretic":[91],"interaction":[92,142],"(GTI)":[93],"method":[94],"assess":[96],"potential":[97],"words,":[104],"which":[105],"can":[106],"also":[107],"be":[108],"seen":[109],"an":[111],"additional":[112],"learning":[113],"signal":[114],"improve":[116],"pure":[118],"learning.":[121],"Furthermore,":[122],"implement":[124],"multi-level":[125],"WTA":[126,152],"GTI,":[128,164],"develop":[130],"token":[132],"cluster":[133,136],"frames/words":[138],"calculate":[140],"scores":[143],"clustered":[146],"tokens.":[147],"Experiments":[148],"show":[149],"that":[150],"our":[151,163],"significantly":[153],"improves":[154],"performance":[157,167],"multiple":[159],"datasets.":[160],"By":[161],"combining":[162],"is":[168],"further":[169],"boosted":[170],"by":[171],"large":[173],"margin.":[174]},"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4386494534","counts_by_year":[{"year":2024,"cited_by_count":4}],"updated_date":"2025-01-08T15:46:38.577318","created_date":"2023-09-07"}