{"id":"https://openalex.org/W4403719532","doi":"https://doi.org/10.48550/arxiv.2409.11656","title":"VL-Reader: Vision and Language Reconstructor is an Effective Scene Text\n Recognizer","display_name":"VL-Reader: Vision and Language Reconstructor is an Effective Scene Text\n Recognizer","publication_year":2024,"publication_date":"2024-09-17","ids":{"openalex":"https://openalex.org/W4403719532","doi":"https://doi.org/10.48550/arxiv.2409.11656"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2409.11656","pdf_url":"http://arxiv.org/pdf/2409.11656","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"type":"preprint","type_crossref":"posted-content","indexed_in":["arxiv"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"http://arxiv.org/pdf/2409.11656","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5041568693","display_name":"Humen Zhong","orcid":"https://orcid.org/0009-0002-8676-0811"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhong, Humen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101406829","display_name":"Zhibo Yang","orcid":"https://orcid.org/0000-0003-2343-7750"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Zhibo","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5112985512","display_name":"Zhaohai Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Zhaohai","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100396141","display_name":"Peng Wang","orcid":"https://orcid.org/0000-0003-3098-009X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Peng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100633673","display_name":"Jun Tang","orcid":"https://orcid.org/0000-0002-5470-8798"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tang, Jun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5056643192","display_name":"Wenqing Cheng","orcid":"https://orcid.org/0000-0001-7840-1195"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cheng, Wenqing","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5100313026","display_name":"Cong Yao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yao, Cong","raw_affiliation_strings":[],"affiliations":[]}],"institution_assertions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.0,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":0,"max":77},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T10601","display_name":"Handwritten Text Recognition Techniques","score":0.4953,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10601","display_name":"Handwritten Text Recognition Techniques","score":0.4953,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.65631413},{"id":"https://openalex.org/C121684516","wikidata":"https://www.wikidata.org/wiki/Q7600677","display_name":"Computer graphics (images)","level":1,"score":0.4597405},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.3966683},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.348964},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3332828}],"mesh":[],"locations_count":1,"locations":[{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2409.11656","pdf_url":"http://arxiv.org/pdf/2409.11656","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false}],"best_oa_location":{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2409.11656","pdf_url":"http://arxiv.org/pdf/2409.11656","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4396701345","https://openalex.org/W4391913857","https://openalex.org/W4391375266","https://openalex.org/W3204019825","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W2376932109","https://openalex.org/W2358668433","https://openalex.org/W2001405890"],"abstract_inverted_index":{"Text":[0],"recognition":[1,88],"is":[2],"an":[3,35,84,183,191,225],"inherent":[4],"integration":[5],"of":[6,38,94,150,194],"vision":[7,53,79,103,218],"and":[8,17,42,54,56,67,80,104,126,143,167,219],"language,":[9,81],"encompassing":[10],"the":[11,18,22,40,50,60,95,99,107,159,172,175,201],"visual":[12,41,125,166],"texture":[13],"in":[14,59,77,98,171],"stroke":[15],"patterns":[16],"semantic":[19,43],"context":[20,142],"among":[21],"character":[23],"sequences.":[24],"Towards":[25],"advanced":[26],"text":[27,87,168,228],"recognition,":[28],"there":[29],"are":[30],"three":[31],"key":[32],"challenges:":[33],"(1)":[34],"encoder":[36],"capable":[37],"representing":[39],"distributions;":[44],"(2)":[45],"a":[46,73,114,132],"decoder":[47],"that":[48,217],"ensures":[49],"alignment":[51],"between":[52,102],"semantics;":[55],"(3)":[57],"consistency":[58,153],"framework":[61],"during":[62],"pre-training,":[63],"if":[64],"it":[65],"exists,":[66],"fine-tuning.":[68,157],"Inspired":[69],"by":[70,203],"masked":[71,140,165,187],"autoencoding,":[72],"successful":[74],"pre-training":[75,155,160],"strategy":[76],"both":[78,164],"we":[82,111,130],"propose":[83],"innovative":[85],"scene":[86,227],"approach,":[89],"named":[90],"VL-Reader.":[91],"The":[92,148,205,214],"novelty":[93],"VL-Reader":[96,151,162],"lies":[97],"pervasive":[100],"interplay":[101],"language":[105,220],"throughout":[106],"entire":[108],"process.":[109],"Concretely,":[110],"first":[112],"introduce":[113],"Masked":[115,133],"Visual-Linguistic":[116,134],"Reconstruction":[117],"(MVLR)":[118],"objective,":[119],"which":[120],"aims":[121],"at":[122],"simultaneously":[123],"modeling":[124],"linguistic":[127],"information.":[128],"Then,":[129],"design":[131],"Decoder":[135],"(MVLD)":[136],"to":[137,156,178],"further":[138],"leverage":[139],"vision-language":[141],"achieve":[144],"bi-modal":[145],"feature":[146],"interaction.":[147],"architecture":[149],"maintains":[152],"from":[154,182],"In":[158],"stage,":[161,174],"reconstructs":[163],"tokens,":[169],"while":[170],"fine-tuning":[173],"network":[176],"degrades":[177],"reconstruct":[179],"all":[180],"characters":[181],"image":[184],"without":[185],"any":[186],"regions.":[188],"VL-reader":[189],"achieves":[190],"average":[192],"accuracy":[193],"97.1%":[195],"on":[196,211],"six":[197],"typical":[198],"datasets,":[199],"surpassing":[200],"SOTA":[202],"1.1%.":[204],"improvement":[206],"was":[207],"even":[208],"more":[209],"significant":[210],"challenging":[212],"datasets.":[213],"results":[215],"demonstrate":[216],"reconstructor":[221],"can":[222],"serve":[223],"as":[224],"effective":[226],"recognizer.":[229]},"abstract_inverted_index_v3":null,"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4403719532","counts_by_year":[],"updated_date":"2025-04-19T02:56:46.352774","created_date":"2024-10-25"}