{"id":"https://openalex.org/W4320342491","doi":"https://doi.org/10.48550/arxiv.2302.04456","title":"ERNIE-Music: Text-to-Waveform Music Generation with Diffusion Models","display_name":"ERNIE-Music: Text-to-Waveform Music Generation with Diffusion Models","publication_year":2023,"publication_date":"2023-01-01","ids":{"openalex":"https://openalex.org/W4320342491","doi":"https://doi.org/10.48550/arxiv.2302.04456"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2302.04456","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false},"type":"preprint","type_crossref":"posted-content","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/abs/2302.04456","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5006952581","display_name":"Pengfei Zhu","orcid":"https://orcid.org/0000-0002-4310-9140"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhu, Pengfei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100545207","display_name":"Chao Pang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pang, Chao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5071055669","display_name":"Shuohuan Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Shuohuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5072082849","display_name":"Yekun Chai","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chai, Yekun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100355572","display_name":"Yu Sun","orcid":"https://orcid.org/0000-0003-2306-7200"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sun, Yu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100771541","display_name":"Hao Tian","orcid":"https://orcid.org/0000-0001-8219-9743"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tian, Hao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5112985399","display_name":"Hua Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Hua","raw_affiliation_strings":[],"affiliations":[]}],"institution_assertions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":5,"citation_normalized_percentile":{"value":0.947487,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":90,"max":92},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9978,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9978,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.9335,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9196,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/relevance","display_name":"Relevance","score":0.7362145}],"concepts":[{"id":"https://openalex.org/C158154518","wikidata":"https://www.wikidata.org/wiki/Q7310970","display_name":"Relevance (law)","level":2,"score":0.7362145},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6996614},{"id":"https://openalex.org/C197424946","wikidata":"https://www.wikidata.org/wiki/Q1165717","display_name":"Waveform","level":3,"score":0.67456216},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.4955181},{"id":"https://openalex.org/C69357855","wikidata":"https://www.wikidata.org/wiki/Q163214","display_name":"Diffusion","level":2,"score":0.42564696},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.37721688},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.35999298},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3328913},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.09106496},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.0},{"id":"https://openalex.org/C134306372","wikidata":"https://www.wikidata.org/wiki/Q7754","display_name":"Mathematical analysis","level":1,"score":0.0},{"id":"https://openalex.org/C554190296","wikidata":"https://www.wikidata.org/wiki/Q47528","display_name":"Radar","level":2,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C17744445","wikidata":"https://www.wikidata.org/wiki/Q36442","display_name":"Political science","level":0,"score":0.0},{"id":"https://openalex.org/C199539241","wikidata":"https://www.wikidata.org/wiki/Q7748","display_name":"Law","level":1,"score":0.0},{"id":"https://openalex.org/C97355855","wikidata":"https://www.wikidata.org/wiki/Q11473","display_name":"Thermodynamics","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2302.04456","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false},{"is_oa":false,"landing_page_url":"https://api.datacite.org/dois/10.48550/arxiv.2302.04456","pdf_url":null,"source":{"id":"https://openalex.org/S4393179698","display_name":"DataCite API","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I4210145204","host_organization_name":"DataCite","host_organization_lineage":["https://openalex.org/I4210145204"],"host_organization_lineage_names":["DataCite"],"type":"metadata"},"license":null,"license_id":null,"version":null}],"best_oa_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2302.04456","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[{"display_name":"Peace, justice, and strong institutions","id":"https://metadata.un.org/sdg/16","score":0.42}],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4367555392","https://openalex.org/W3040712279","https://openalex.org/W2883092465","https://openalex.org/W2374664672","https://openalex.org/W2364769705","https://openalex.org/W2176409448","https://openalex.org/W2129841057","https://openalex.org/W2114441484","https://openalex.org/W2056136368","https://openalex.org/W1974895211"],"abstract_inverted_index":{"In":[0,35],"recent":[1],"years,":[2],"the":[3,20,47,57,66,78,83,88,97,122,146,168,185],"burgeoning":[4],"interest":[5],"in":[6,14,46,153,163,174,184,195],"diffusion":[7,60,84],"models":[8],"has":[9],"led":[10],"to":[11,37,76,120],"significant":[12],"advances":[13],"image":[15],"and":[16,135,200],"speech":[17],"generation.":[18,176],"Nevertheless,":[19],"direct":[21],"synthesis":[22],"of":[23,49,59,69,90,99,124,141,149,155,167,171,197],"music":[24,52,133,183],"waveforms":[25],"from":[26],"unrestricted":[27],"textual":[28,71,137],"prompts":[29,72],"remains":[30],"a":[31,43,50,100,106,114,164,192],"relatively":[32],"underexplored":[33],"domain.":[34],"response":[36],"this":[38,40,142],"lacuna,":[39],"paper":[41],"introduces":[42],"pioneering":[44],"contribution":[45],"form":[48],"text-to-waveform":[51],"generation":[53,80],"model,":[54],"underpinned":[55],"by":[56,102,109,191],"utilization":[58],"models.":[61],"Our":[62],"methodology":[63],"hinges":[64],"on":[65],"innovative":[67],"incorporation":[68],"free-form":[70],"as":[73],"conditional":[74],"factors":[75],"guide":[77],"waveform":[79,186],"process":[81],"within":[82],"model":[85,152,173],"framework.":[86],"Addressing":[87],"challenge":[89],"limited":[91],"text-music":[92,157,201],"parallel":[93],"data,":[94],"we":[95],"undertake":[96],"creation":[98],"dataset":[101],"harnessing":[103],"web":[104],"resources,":[105],"task":[107],"facilitated":[108],"weak":[110],"supervision":[111],"techniques.":[112],"Furthermore,":[113],"rigorous":[115],"empirical":[116],"inquiry":[117],"is":[118],"undertaken":[119],"contrast":[121],"efficacy":[123],"two":[125],"distinct":[126],"prompt":[127],"formats":[128],"for":[129],"text":[130],"conditioning,":[131],"namely,":[132],"tags":[134],"unconstrained":[136],"descriptions.":[138],"The":[139],"outcomes":[140],"comparative":[143],"analysis":[144],"affirm":[145],"superior":[147],"performance":[148],"our":[150,160,172,181],"proposed":[151],"terms":[154,196],"enhancing":[156],"relevance.":[158,202],"Finally,":[159],"work":[161],"culminates":[162],"demonstrative":[165],"exhibition":[166],"excellent":[169],"capabilities":[170],"text-to-music":[175],"We":[177],"further":[178],"demonstrate":[179],"that":[180],"generated":[182],"domain":[187],"outperforms":[188],"previous":[189],"works":[190],"large":[193],"margin":[194],"diversity,":[198],"quality,":[199]},"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4320342491","counts_by_year":[{"year":2024,"cited_by_count":3},{"year":2023,"cited_by_count":2}],"updated_date":"2025-01-24T03:48:04.076446","created_date":"2023-02-13"}