{"id":"https://openalex.org/W2937090315","doi":"https://doi.org/10.1109/icassp.2019.8683131","title":"A Factorial Deep Markov Model for Unsupervised Disentangled Representation Learning from Speech","display_name":"A Factorial Deep Markov Model for Unsupervised Disentangled Representation Learning from Speech","publication_year":2019,"publication_date":"2019-04-17","ids":{"openalex":"https://openalex.org/W2937090315","doi":"https://doi.org/10.1109/icassp.2019.8683131","mag":"2937090315"},"language":"en","primary_location":{"is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp.2019.8683131","pdf_url":null,"source":{"id":"https://openalex.org/S4363607702","display_name":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false},"type":"article","type_crossref":"proceedings-article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5008673783","display_name":"Sameer Khurana","orcid":null},"institutions":[],"countries":["US"],"is_corresponding":false,"raw_author_name":"Sameer Khurana","raw_affiliation_strings":["MIT Computer Science & Artificial Intelligence Laboratory, Cambridge, MA, USA"],"affiliations":[{"raw_affiliation_string":"MIT Computer Science & Artificial Intelligence Laboratory, Cambridge, MA, USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5005443526","display_name":"Shafiq Joty","orcid":"https://orcid.org/0000-0002-9222-2641"},"institutions":[],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Shafiq Rayhan Joty","raw_affiliation_strings":["School of Computer Science and Engineering, NTU, Singapore"],"affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, NTU, Singapore","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100693435","display_name":"Ahmed Ali","orcid":"https://orcid.org/0000-0002-9186-7544"},"institutions":[{"id":"https://openalex.org/I1301390666","display_name":"Qatar Airways (Qatar)","ror":"https://ror.org/01hx00y13","country_code":"QA","type":"company","lineage":["https://openalex.org/I1301390666"]}],"countries":["QA"],"is_corresponding":false,"raw_author_name":"Ahmed Ali","raw_affiliation_strings":["Qatar Computing Research Institute, HBKU, Doha, Qatar"],"affiliations":[{"raw_affiliation_string":"Qatar Computing Research Institute, HBKU, Doha, Qatar","institution_ids":["https://openalex.org/I1301390666"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5112758056","display_name":"James Glass","orcid":null},"institutions":[],"countries":["US"],"is_corresponding":false,"raw_author_name":"James Glass","raw_affiliation_strings":["MIT Computer Science & Artificial Intelligence Laboratory, Cambridge, MA, USA"],"affiliations":[{"raw_affiliation_string":"MIT Computer Science & Artificial Intelligence Laboratory, Cambridge, MA, USA","institution_ids":[]}]}],"institution_assertions":[],"countries_distinct_count":3,"institutions_distinct_count":1,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.859,"has_fulltext":true,"fulltext_origin":"ngrams","cited_by_count":15,"citation_normalized_percentile":{"value":0.881774,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":90},"biblio":{"volume":null,"issue":null,"first_page":"6540","last_page":"6544"},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9976,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9958,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/representation","display_name":"Representation","score":0.58261186},{"id":"https://openalex.org/keywords/factorial","display_name":"Factorial","score":0.51056635},{"id":"https://openalex.org/keywords/feature-learning","display_name":"Feature Learning","score":0.47602776},{"id":"https://openalex.org/keywords/word-error-rate","display_name":"Word error rate","score":0.43285617}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7511313},{"id":"https://openalex.org/C51167844","wikidata":"https://www.wikidata.org/wiki/Q4422623","display_name":"Latent variable","level":2,"score":0.70507735},{"id":"https://openalex.org/C23224414","wikidata":"https://www.wikidata.org/wiki/Q176769","display_name":"Hidden Markov model","level":2,"score":0.6068157},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.59201604},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.58261186},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.57239765},{"id":"https://openalex.org/C183763347","wikidata":"https://www.wikidata.org/wiki/Q120976","display_name":"Factorial","level":2,"score":0.51056635},{"id":"https://openalex.org/C163836022","wikidata":"https://www.wikidata.org/wiki/Q6771326","display_name":"Markov model","level":3,"score":0.48725358},{"id":"https://openalex.org/C65965080","wikidata":"https://www.wikidata.org/wiki/Q1806885","display_name":"Latent variable model","level":3,"score":0.477153},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.47602776},{"id":"https://openalex.org/C40969351","wikidata":"https://www.wikidata.org/wiki/Q3516228","display_name":"Word error rate","level":2,"score":0.43285617},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.38166106},{"id":"https://openalex.org/C98763669","wikidata":"https://www.wikidata.org/wiki/Q176645","display_name":"Markov chain","level":2,"score":0.355472},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.34069255},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.14825058},{"id":"https://openalex.org/C134306372","wikidata":"https://www.wikidata.org/wiki/Q7754","display_name":"Mathematical analysis","level":1,"score":0.0},{"id":"https://openalex.org/C94625758","wikidata":"https://www.wikidata.org/wiki/Q7163","display_name":"Politics","level":2,"score":0.0},{"id":"https://openalex.org/C17744445","wikidata":"https://www.wikidata.org/wiki/Q36442","display_name":"Political science","level":0,"score":0.0},{"id":"https://openalex.org/C199539241","wikidata":"https://www.wikidata.org/wiki/Q7748","display_name":"Law","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp.2019.8683131","pdf_url":null,"source":{"id":"https://openalex.org/S4363607702","display_name":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/10","display_name":"Reduced inequalities","score":0.51}],"grants":[],"datasets":[],"versions":[],"referenced_works_count":25,"referenced_works":["https://openalex.org/W111477576","https://openalex.org/W1524333225","https://openalex.org/W1635512741","https://openalex.org/W1959608418","https://openalex.org/W2005708641","https://openalex.org/W2055408826","https://openalex.org/W2100768664","https://openalex.org/W2134827050","https://openalex.org/W2225156818","https://openalex.org/W2293634267","https://openalex.org/W2572702041","https://openalex.org/W2584414011","https://openalex.org/W2748528945","https://openalex.org/W2758785877","https://openalex.org/W2791874451","https://openalex.org/W2793111190","https://openalex.org/W2962922562","https://openalex.org/W2963134917","https://openalex.org/W2963417023","https://openalex.org/W2963499843","https://openalex.org/W2963618559","https://openalex.org/W2964232608","https://openalex.org/W3101380508","https://openalex.org/W3211294292","https://openalex.org/W4297730691"],"related_works":["https://openalex.org/W62001224","https://openalex.org/W4243467573","https://openalex.org/W3122667150","https://openalex.org/W3032390039","https://openalex.org/W2461917396","https://openalex.org/W2174922504","https://openalex.org/W2037497866","https://openalex.org/W1966667550","https://openalex.org/W1584341211","https://openalex.org/W1502435251"],"abstract_inverted_index":{"We":[0,27],"present":[1],"the":[2,37,54,70],"Factorial":[3],"Deep":[4],"Markov":[5],"Model":[6],"(FDMM)":[7],"for":[8],"representation":[9],"learning":[10],"of":[11,73],"speech.":[12],"The":[13],"FDMM":[14,55],"learns":[15],"disentangled,":[16],"interpretable":[17],"and":[18,31,64],"lower":[19],"dimensional":[20],"latent":[21,33],"representations":[22,51],"from":[23],"speech":[24,43],"without":[25],"supervision.":[26],"use":[28],"a":[29,42,57,74,79],"static":[30],"dynamic":[32],"variable":[34],"to":[35],"exploit":[36],"fact":[38],"that":[39],"information":[40],"in":[41,78],"signal":[44],"evolves":[45],"at":[46],"different":[47],"time":[48],"scales.":[49],"Latent":[50],"learned":[52],"by":[53],"outperform":[56],"baseline":[58],"i-vector":[59],"system":[60,77],"on":[61],"speaker":[62],"verification":[63],"dialect":[65],"identification":[66],"while":[67],"also":[68],"reducing":[69],"error":[71],"rate":[72],"phone":[75],"recognition":[76],"domain":[80],"mismatch":[81],"scenario.":[82]},"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W2937090315","counts_by_year":[{"year":2024,"cited_by_count":1},{"year":2023,"cited_by_count":1},{"year":2022,"cited_by_count":1},{"year":2021,"cited_by_count":4},{"year":2020,"cited_by_count":7},{"year":2019,"cited_by_count":1}],"updated_date":"2025-01-17T17:08:47.966669","created_date":"2019-04-25"}