{"id":"https://openalex.org/W4391940860","doi":"https://doi.org/10.48550/arxiv.2402.10533","title":"APCodec: A Neural Audio Codec with Parallel Amplitude and Phase Spectrum\n Encoding and Decoding","display_name":"APCodec: A Neural Audio Codec with Parallel Amplitude and Phase Spectrum\n Encoding and Decoding","publication_year":2024,"publication_date":"2024-02-16","ids":{"openalex":"https://openalex.org/W4391940860","doi":"https://doi.org/10.48550/arxiv.2402.10533"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2402.10533","pdf_url":"http://arxiv.org/pdf/2402.10533","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"type":"preprint","type_crossref":"posted-content","indexed_in":["arxiv"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"http://arxiv.org/pdf/2402.10533","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5014746276","display_name":"Yang Ai","orcid":"https://orcid.org/0009-0006-0157-4980"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ai, Yang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5031429152","display_name":"Xiao-Hang Jiang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiang, Xiao-Hang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5072371384","display_name":"Ye-Xin Lu","orcid":"https://orcid.org/0009-0009-8026-0702"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lu, Ye-Xin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5067982618","display_name":"Hui-Peng Du","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Du, Hui-Peng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5059767940","display_name":"Zhen-Hua Ling","orcid":"https://orcid.org/0000-0001-7853-5273"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ling, Zhen-Hua","raw_affiliation_strings":[],"affiliations":[]}],"institution_assertions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.0,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":0,"max":84},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T10320","display_name":"Neural Networks and Applications","score":0.9941,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10320","display_name":"Neural Networks and Applications","score":0.9941,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9787,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11034","display_name":"Digital Filter Design and Implementation","score":0.9686,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/codec","display_name":"Codec","score":0.77401173}],"concepts":[{"id":"https://openalex.org/C161765866","wikidata":"https://www.wikidata.org/wiki/Q184748","display_name":"Codec","level":2,"score":0.77401173},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.738006},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.69492555},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.64936954},{"id":"https://openalex.org/C177067256","wikidata":"https://www.wikidata.org/wiki/Q4676210","display_name":"Adaptive Multi-Rate audio codec","level":4,"score":0.5693495},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.56409216},{"id":"https://openalex.org/C156778621","wikidata":"https://www.wikidata.org/wiki/Q1365748","display_name":"Spectrum (functional analysis)","level":2,"score":0.47611743},{"id":"https://openalex.org/C180205008","wikidata":"https://www.wikidata.org/wiki/Q159190","display_name":"Amplitude","level":2,"score":0.42771858},{"id":"https://openalex.org/C44280652","wikidata":"https://www.wikidata.org/wiki/Q104837","display_name":"Phase (matter)","level":2,"score":0.42439264},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.20907137},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.14871383},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.1258448},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.118676245},{"id":"https://openalex.org/C204201278","wikidata":"https://www.wikidata.org/wiki/Q1332614","display_name":"Voice activity detection","level":3,"score":0.07784492},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2402.10533","pdf_url":"http://arxiv.org/pdf/2402.10533","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false}],"best_oa_location":{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2402.10533","pdf_url":"http://arxiv.org/pdf/2402.10533","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4252424250","https://openalex.org/W4241950017","https://openalex.org/W2949131019","https://openalex.org/W2382399415","https://openalex.org/W2370747337","https://openalex.org/W2369511196","https://openalex.org/W2163719598","https://openalex.org/W2157819213","https://openalex.org/W2133351881","https://openalex.org/W1890500690"],"abstract_inverted_index":{"This":[0,107],"paper":[1],"introduces":[2],"a":[3,61,74,98,103,182],"novel":[4],"neural":[5],"audio":[6,35,48,88,120,144,205,234],"codec":[7],"targeting":[8],"high":[9],"waveform":[10,27,130,146],"sampling":[11],"rates":[12],"and":[13,26,37,44,60,90,122,127,152,175,236,249],"low":[14],"bitrates":[15],"named":[16],"APCodec,":[17,180],"which":[18],"seamlessly":[19],"integrates":[20],"the":[21,32,42,64,70,78,87,113,116,119,128,140,164,189,217,220,224],"strengths":[22],"of":[23,34,57,142,191,208,219],"parametric":[24,49,52],"codecs":[25],"codecs.":[28,53],"The":[29,84],"APCodec":[30,200,229],"revolutionizes":[31],"process":[33],"encoding":[36],"decoding":[38],"by":[39,73,112,133],"concurrently":[40],"handling":[41],"amplitude":[43,89,121],"phase":[45,91,123],"spectra":[46,92,124],"as":[47,69,245],"characteristics":[50],"like":[51,145],"It":[54],"is":[55,109,131],"composed":[56],"an":[58],"encoder":[59,85],"decoder":[62,117],"with":[63,212],"modified":[65],"ConvNeXt":[66],"v2":[67],"network":[68,155],"backbone,":[71],"connected":[72],"quantizer":[75],"based":[76,157],"on":[77],"residual":[79],"vector":[80],"quantization":[81,150],"(RVQ)":[82],"mechanism.":[83],"compresses":[86],"in":[93,125,179,216],"parallel,":[94,126],"amalgamating":[95],"them":[96],"into":[97],"continuous":[99],"latent":[100],"code":[101,108],"at":[102,206],"reduced":[104],"temporal":[105],"resolution.":[106],"subsequently":[110],"quantized":[111],"quantizer.":[114],"Ultimately,":[115],"reconstructs":[118],"decoded":[129,143,192,221,233],"obtained":[132],"inverse":[134],"short-time":[135],"Fourier":[136],"transform.":[137],"To":[138,166],"ensure":[139],"fidelity":[141],"codecs,":[147,243],"spectral-level":[148],"loss,":[149,151],"generative":[153],"adversarial":[154],"(GAN)":[156],"loss":[158],"are":[159],"collectively":[160],"employed":[161],"for":[162],"training":[163,185],"APCodec.":[165],"support":[167],"low-latency":[168],"streamable":[169],"inference,":[170],"we":[171],"employ":[172],"feed-forward":[173],"layers":[174,178],"causal":[176],"convolutional":[177],"incorporating":[181],"knowledge":[183],"distillation":[184],"strategy":[186],"to":[187,241],"enhance":[188],"quality":[190,218,235],"audio.":[193,222],"Experimental":[194],"results":[195],"confirm":[196],"that":[197],"our":[198,227],"proposed":[199,228],"can":[201],"encode":[202],"48":[203],"kHz":[204],"bitrate":[207],"just":[209],"6":[210],"kbps,":[211],"no":[213],"significant":[214],"degradation":[215],"At":[223],"same":[225],"bitrate,":[226],"also":[230],"demonstrates":[231],"superior":[232],"faster":[237],"generation":[238],"speed":[239],"compared":[240],"well-known":[242],"such":[244],"SoundStream,":[246],"Encodec,":[247],"HiFi-Codec":[248],"AudioDec.":[250]},"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4391940860","counts_by_year":[],"updated_date":"2024-12-13T07:15:36.763426","created_date":"2024-02-20"}