{"id":"https://openalex.org/W4282967752","doi":"https://doi.org/10.48550/arxiv.2206.06619","title":"TransVG++: End-to-End Visual Grounding with Language Conditioned Vision Transformer","display_name":"TransVG++: End-to-End Visual Grounding with Language Conditioned Vision Transformer","publication_year":2022,"publication_date":"2022-01-01","ids":{"openalex":"https://openalex.org/W4282967752","doi":"https://doi.org/10.48550/arxiv.2206.06619"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2206.06619","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false},"type":"preprint","type_crossref":"posted-content","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/abs/2206.06619","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5089561793","display_name":"Jiajun Deng","orcid":"https://orcid.org/0000-0001-9624-7451"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Deng, Jiajun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5050209478","display_name":"Zhengyuan Yang","orcid":"https://orcid.org/0000-0002-5808-0889"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Zhengyuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101424059","display_name":"Daqing Liu","orcid":"https://orcid.org/0000-0002-8286-0105"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Daqing","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5015882351","display_name":"Tianlang Chen","orcid":"https://orcid.org/0000-0002-6355-6474"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Tianlang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5046805800","display_name":"Wengang Zhou","orcid":"https://orcid.org/0000-0003-1690-9836"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Wengang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5053344541","display_name":"Yanyong Zhang","orcid":"https://orcid.org/0000-0001-6520-255X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Yanyong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5078141810","display_name":"Houqiang Li","orcid":"https://orcid.org/0000-0003-2188-3028"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Houqiang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5087818121","display_name":"Wanli Ouyang","orcid":"https://orcid.org/0000-0002-9163-2761"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ouyang, Wanli","raw_affiliation_strings":[],"affiliations":[]}],"institution_assertions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.0,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":0,"max":61},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Visual Question Answering in Images and Videos","score":0.9994,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Visual Question Answering in Images and Videos","score":0.9994,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Advances in Transfer Learning and Domain Adaptation","score":0.9987,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Image Feature Retrieval and Recognition Techniques","score":0.9947,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/visual-question-answering","display_name":"Visual Question Answering","score":0.578404},{"id":"https://openalex.org/keywords/multimodal-fusion","display_name":"Multimodal Fusion","score":0.553137},{"id":"https://openalex.org/keywords/transfer-learning","display_name":"Transfer Learning","score":0.533666},{"id":"https://openalex.org/keywords/cross-modal-retrieval","display_name":"Cross-Modal Retrieval","score":0.525782},{"id":"https://openalex.org/keywords/visual-recognition","display_name":"Visual Recognition","score":0.523118},{"id":"https://openalex.org/keywords/resolver","display_name":"Resolver","score":0.4294399}],"concepts":[{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.7267365},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6463667},{"id":"https://openalex.org/C168993435","wikidata":"https://www.wikidata.org/wiki/Q6501125","display_name":"Ground","level":2,"score":0.5368263},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.5148079},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.48492435},{"id":"https://openalex.org/C71139939","wikidata":"https://www.wikidata.org/wiki/Q910194","display_name":"Modal","level":2,"score":0.48301065},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.44822294},{"id":"https://openalex.org/C80156102","wikidata":"https://www.wikidata.org/wiki/Q788036","display_name":"Resolver","level":3,"score":0.4294399},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.42035675},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.21175954},{"id":"https://openalex.org/C119599485","wikidata":"https://www.wikidata.org/wiki/Q43035","display_name":"Electrical engineering","level":1,"score":0.14017192},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.112668246},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.0},{"id":"https://openalex.org/C165005293","wikidata":"https://www.wikidata.org/wiki/Q1074500","display_name":"Chip","level":2,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C188027245","wikidata":"https://www.wikidata.org/wiki/Q750446","display_name":"Polymer chemistry","level":1,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2206.06619","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false},{"is_oa":false,"landing_page_url":"https://api.datacite.org/dois/10.48550/arxiv.2206.06619","pdf_url":null,"source":{"id":"https://openalex.org/S4393179698","display_name":"DataCite API","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I4210145204","host_organization_name":"DataCite","host_organization_lineage":["https://openalex.org/I4210145204"],"host_organization_lineage_names":["DataCite"],"type":"metadata"},"license":null,"license_id":null,"version":null}],"best_oa_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2206.06619","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4210337026","https://openalex.org/W2964964914","https://openalex.org/W2804343227","https://openalex.org/W2588537960","https://openalex.org/W2377847262","https://openalex.org/W2369328568","https://openalex.org/W2158159993","https://openalex.org/W2142531896","https://openalex.org/W2121484818","https://openalex.org/W1526923327"],"abstract_inverted_index":{"In":[0],"this":[1,128],"work,":[2],"we":[3,51,130,141,161],"explore":[4],"neat":[5],"yet":[6],"effective":[7],"Transformer-based":[8,148],"frameworks":[9],"for":[10,155,177],"visual":[11,22,112],"grounding.":[12],"The":[13],"previous":[14],"methods":[15],"generally":[16],"address":[17],"the":[18,92,174,181],"core":[19,93],"problem":[20],"of":[21,84,196],"grounding,":[23],"i.e.,":[24],"multi-modal":[25,57],"fusion":[26,75,94,170,179],"and":[27,61,103,122,172,192],"reasoning,":[28],"with":[29,88],"manually-designed":[30],"mechanisms.":[31],"Such":[32],"heuristic":[33],"designs":[34],"are":[35],"not":[36],"only":[37],"complicated":[38,74],"but":[39],"also":[40],"make":[41,135],"models":[42],"easily":[43],"overfit":[44],"specific":[45],"data":[46],"distributions.":[47],"To":[48,127],"avoid":[49],"this,":[50],"first":[52],"propose":[53],"TransVG,":[54],"which":[55,115],"establishes":[56],"correspondences":[58],"by":[59,65,80,150],"Transformers":[60],"localizes":[62],"referred":[63],"regions":[64],"directly":[66],"regressing":[67],"box":[68],"coordinates.":[69],"We":[70,184],"empirically":[71],"show":[72],"that":[73,167],"modules":[76,171],"can":[77],"be":[78,106,120],"replaced":[79],"a":[81,146,194],"simple":[82],"stack":[83],"Transformer":[85,95,153,166],"encoder":[86],"layers":[87],"higher":[89],"performance.":[90,126],"However,":[91],"in":[96],"TransVG":[97],"is":[98],"stand-alone":[99],"against":[100],"uni-modal":[101,175],"encoders,":[102],"thus":[104],"should":[105],"trained":[107],"from":[108],"scratch":[109],"on":[110,188],"limited":[111],"grounding":[113],"data,":[114],"makes":[116],"it":[117],"hard":[118],"to":[119,124,134,145],"optimized":[121],"leads":[123],"sub-optimal":[125],"end,":[129],"further":[131],"introduce":[132],"TransVG++":[133],"two-fold":[136],"improvements.":[137],"For":[138,159],"one":[139,149],"thing,":[140],"upgrade":[142],"our":[143],"framework":[144],"purely":[147],"leveraging":[151],"Vision":[152,165],"(ViT)":[154],"vision":[156],"feature":[157],"encoding.":[158],"another,":[160],"devise":[162],"Language":[163],"Conditioned":[164],"removes":[168],"external":[169],"reuses":[173],"ViT":[176],"vision-language":[178],"at":[180],"intermediate":[182],"layers.":[183],"conduct":[185],"extensive":[186],"experiments":[187],"five":[189],"prevalent":[190],"datasets,":[191],"report":[193],"series":[195],"state-of-the-art":[197],"records.":[198]},"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4282967752","counts_by_year":[],"updated_date":"2024-10-27T13:44:40.608280","created_date":"2022-06-17"}