{"id":"https://openalex.org/W4394736689","doi":"https://doi.org/10.48550/arxiv.2404.06918","title":"HRVDA: High-Resolution Visual Document Assistant","display_name":"HRVDA: High-Resolution Visual Document Assistant","publication_year":2024,"publication_date":"2024-04-10","ids":{"openalex":"https://openalex.org/W4394736689","doi":"https://doi.org/10.48550/arxiv.2404.06918"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2404.06918","pdf_url":"http://arxiv.org/pdf/2404.06918","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"type":"preprint","type_crossref":"posted-content","indexed_in":["arxiv"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"http://arxiv.org/pdf/2404.06918","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5061846588","display_name":"Chaohu Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Chaohu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5022465137","display_name":"Kun Yin","orcid":"https://orcid.org/0000-0001-6300-6985"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yin, Kun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5024986567","display_name":"Haoyu Cao","orcid":"https://orcid.org/0000-0002-3789-9705"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cao, Haoyu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5075649586","display_name":"Xinghua Jiang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiang, Xinghua","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100354056","display_name":"Xin Li","orcid":"https://orcid.org/0000-0003-2999-9818"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Xin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5084063805","display_name":"Yinsong Liu","orcid":"https://orcid.org/0000-0002-0096-3662"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Yinsong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5016912950","display_name":"Deqiang Jiang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiang, Deqiang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100388508","display_name":"Xing Sun","orcid":"https://orcid.org/0000-0002-7683-4517"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sun, Xing","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5101859765","display_name":"Linli Xu","orcid":"https://orcid.org/0000-0001-6004-4425"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Linli","raw_affiliation_strings":[],"affiliations":[]}],"institution_assertions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.0,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":0,"max":81},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9717,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9717,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10215","display_name":"Semantic Web and Ontologies","score":0.9667,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9195,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[],"concepts":[{"id":"https://openalex.org/C138268822","wikidata":"https://www.wikidata.org/wiki/Q1051925","display_name":"Resolution (logic)","level":2,"score":0.5699637},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.54980654},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.39423582},{"id":"https://openalex.org/C121684516","wikidata":"https://www.wikidata.org/wiki/Q7600677","display_name":"Computer graphics (images)","level":1,"score":0.3486419},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.34431985},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.34092692}],"mesh":[],"locations_count":1,"locations":[{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2404.06918","pdf_url":"http://arxiv.org/pdf/2404.06918","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false}],"best_oa_location":{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2404.06918","pdf_url":"http://arxiv.org/pdf/2404.06918","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W3116076068","https://openalex.org/W2951359407","https://openalex.org/W2775347418","https://openalex.org/W2772917594","https://openalex.org/W2755342338","https://openalex.org/W2229312674","https://openalex.org/W2166024367","https://openalex.org/W2079911747","https://openalex.org/W2058170566","https://openalex.org/W1969923398"],"abstract_inverted_index":{"Leveraging":[0],"vast":[1],"training":[2,128,148,173],"data,":[3],"multimodal":[4],"large":[5],"language":[6],"models":[7],"(MLLMs)":[8],"have":[9],"demonstrated":[10],"formidable":[11],"general":[12],"visual":[13,27,45,67,97,118,122,140],"comprehension":[14],"capabilities":[15],"and":[16,96,107,120,129,144,175],"achieved":[17],"remarkable":[18],"performance":[19,25,165],"across":[20,166],"various":[21],"tasks.":[22],"However,":[23],"their":[24],"in":[26,75],"document":[28,46,98,154,168],"understanding":[29,47,169],"still":[30],"leaves":[31],"much":[32],"room":[33],"for":[34,131],"improvement.":[35],"This":[36,100],"discrepancy":[37],"is":[38,48],"primarily":[39],"attributed":[40],"to":[41,62,112,150,179],"the":[42,92,116,152],"fact":[43],"that":[44,160],"a":[49,63,84,103,138,146],"fine-grained":[50],"prediction":[51],"task.":[52],"In":[53,79,134],"natural":[54],"scenes,":[55],"MLLMs":[56,71,95],"typically":[57],"use":[58],"low-resolution":[59,180],"images,":[60],"leading":[61],"substantial":[64],"loss":[65],"of":[66],"information.":[68],"Furthermore,":[69],"general-purpose":[70],"do":[72],"not":[73],"excel":[74],"handling":[76],"document-oriented":[77,139],"instructions.":[78],"this":[80],"paper,":[81],"we":[82,136],"propose":[83],"High-Resolution":[85],"Visual":[86],"Document":[87],"Assistant":[88],"(HRVDA),":[89],"which":[90],"bridges":[91],"gap":[93],"between":[94],"understanding.":[99],"model":[101,127,162],"employs":[102],"content":[104],"filtering":[105,110],"mechanism":[106],"an":[108],"instruction":[109,141],"module":[111],"separately":[113],"filter":[114],"out":[115],"content-agnostic":[117],"tokens":[119],"instruction-agnostic":[121],"tokens,":[123],"thereby":[124],"achieving":[125],"efficient":[126],"inference":[130,176],"high-resolution":[132],"images.":[133],"addition,":[135],"construct":[137],"tuning":[142],"dataset":[143],"apply":[145],"multi-stage":[147],"strategy":[149],"enhance":[151],"model's":[153],"modeling":[155],"capabilities.":[156],"Extensive":[157],"experiments":[158],"demonstrate":[159],"our":[161],"achieves":[163],"state-of-the-art":[164],"multiple":[167],"datasets,":[170],"while":[171],"maintaining":[172],"efficiency":[174],"speed":[177],"comparable":[178],"models.":[181]},"abstract_inverted_index_v3":null,"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4394736689","counts_by_year":[],"updated_date":"2025-02-20T05:41:46.658039","created_date":"2024-04-12"}