{"id":"https://openalex.org/W4404345153","doi":"https://doi.org/10.48550/arxiv.2411.00464","title":"MDCTCodec: A Lightweight MDCT-based Neural Audio Codec towards High\n Sampling Rate and Low Bitrate Scenarios","display_name":"MDCTCodec: A Lightweight MDCT-based Neural Audio Codec towards High\n Sampling Rate and Low Bitrate Scenarios","publication_year":2024,"publication_date":"2024-11-01","ids":{"openalex":"https://openalex.org/W4404345153","doi":"https://doi.org/10.48550/arxiv.2411.00464"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2411.00464","pdf_url":"http://arxiv.org/pdf/2411.00464","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"type":"preprint","type_crossref":"posted-content","indexed_in":["arxiv"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"http://arxiv.org/pdf/2411.00464","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5031429152","display_name":"Xiao-Hang Jiang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiang, Xiao-Hang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5045907056","display_name":"Yang Ai","orcid":"https://orcid.org/0000-0001-6668-022X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ai, Yang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111310848","display_name":"Rui-Chen Zheng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zheng, Rui-Chen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5067982618","display_name":"Hui-Peng Du","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Du, Hui-Peng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5072371384","display_name":"Ye-Xin Lu","orcid":"https://orcid.org/0009-0009-8026-0702"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lu, Ye-Xin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5059767940","display_name":"Zhen-Hua Ling","orcid":"https://orcid.org/0000-0001-7853-5273"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ling, Zhen-Hua","raw_affiliation_strings":[],"affiliations":[]}],"institution_assertions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.0,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":0,"max":84},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9921,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9921,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9782,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10901","display_name":"Advanced Data Compression Techniques","score":0.973,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/codec","display_name":"Codec","score":0.8574946}],"concepts":[{"id":"https://openalex.org/C161765866","wikidata":"https://www.wikidata.org/wiki/Q184748","display_name":"Codec","level":2,"score":0.8574946},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5752599},{"id":"https://openalex.org/C140779682","wikidata":"https://www.wikidata.org/wiki/Q210868","display_name":"Sampling (signal processing)","level":3,"score":0.43417054},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.32045817},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.1659038},{"id":"https://openalex.org/C94915269","wikidata":"https://www.wikidata.org/wiki/Q1834857","display_name":"Detector","level":2,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2411.00464","pdf_url":"http://arxiv.org/pdf/2411.00464","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false}],"best_oa_location":{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2411.00464","pdf_url":"http://arxiv.org/pdf/2411.00464","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W4301184752","https://openalex.org/W3161919736","https://openalex.org/W2964213236","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2387018512","https://openalex.org/W2288771647","https://openalex.org/W2163719598","https://openalex.org/W2107680156"],"abstract_inverted_index":{"In":[0],"this":[1],"paper,":[2],"we":[3],"propose":[4],"MDCTCodec,":[5],"an":[6],"efficient":[7],"lightweight":[8],"end-to-end":[9],"neural":[10],"audio":[11,28,62,107],"codec":[12],"based":[13],"on":[14,144],"the":[15,24,49,52,56,67,80,102,123,145],"modified":[16],"discrete":[17],"cosine":[18],"transform":[19],"(MDCT).":[20],"The":[21],"encoder":[22],"takes":[23],"MDCT":[25,53,84],"spectrum":[26,54,85],"of":[27,129,135,141],"as":[29],"input,":[30],"encoding":[31],"it":[32],"into":[33],"a":[34,43,70,126,132,139],"continuous":[35],"latent":[36,58],"code":[37,59],"which":[38],"is":[39,76],"then":[40],"discretized":[41],"by":[42],"residual":[44],"vector":[45],"quantizer":[46],"(RVQ).":[47],"Subsequently,":[48],"decoder":[50],"decodes":[51],"from":[55],"quantized":[57],"and":[60,99,111,114,138],"reconstructs":[61],"via":[63],"inverse":[64],"MDCT.":[65],"During":[66],"training":[68,110],"phase,":[69],"novel":[71],"multi-resolution":[72],"MDCT-based":[73],"discriminator":[74],"(MR-MDCTD)":[75],"adopted":[77],"to":[78,119],"discriminate":[79],"natural":[81],"or":[82],"decoded":[83,106],"for":[86],"adversarial":[87],"training.":[88],"Experimental":[89],"results":[90],"confirm":[91],"that,":[92],"in":[93],"scenarios":[94],"with":[95],"high":[96,105],"sampling":[97,133],"rates":[98],"low":[100],"bitrates,":[101],"MDCTCodec":[103,124],"exhibited":[104],"quality,":[108],"improved":[109],"generation":[112],"efficiency,":[113],"compact":[115],"model":[116],"size":[117],"compared":[118],"baseline":[120],"codecs.":[121],"Specifically,":[122],"achieved":[125],"ViSQOL":[127],"score":[128],"4.18":[130],"at":[131],"rate":[134],"48":[136],"kHz":[137],"bitrate":[140],"6":[142],"kbps":[143],"public":[146],"VCTK":[147],"corpus.":[148]},"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4404345153","counts_by_year":[],"updated_date":"2024-12-15T10:08:27.296726","created_date":"2024-11-14"}