{"id":"https://openalex.org/W4404986724","doi":"https://doi.org/10.48550/arxiv.2411.15714","title":"ROOT: VLM based System for Indoor Scene Understanding and Beyond","display_name":"ROOT: VLM based System for Indoor Scene Understanding and Beyond","publication_year":2024,"publication_date":"2024-11-23","ids":{"openalex":"https://openalex.org/W4404986724","doi":"https://doi.org/10.48550/arxiv.2411.15714"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2411.15714","pdf_url":"http://arxiv.org/pdf/2411.15714","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"type":"preprint","type_crossref":"posted-content","indexed_in":["arxiv"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"http://arxiv.org/pdf/2411.15714","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100319205","display_name":"Yonghui Wang","orcid":"https://orcid.org/0000-0003-4334-0921"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Yonghui","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5067624396","display_name":"Shi-Yong Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Shi-Yong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5040082179","display_name":"Z. Zhou","orcid":"https://orcid.org/0000-0002-9401-6390"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Zhenxing","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5014850295","display_name":"Shengbiao Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Siyi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5083071261","display_name":"H Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Haoran","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113273090","display_name":"W. Zhou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Wengang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5083071261","display_name":"H Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Houqiang","raw_affiliation_strings":[],"affiliations":[]}],"institution_assertions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.0,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":0,"max":82},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T10531","display_name":"Advanced Vision and Imaging","score":0.9499,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10531","display_name":"Advanced Vision and Imaging","score":0.9499,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11164","display_name":"Remote Sensing and LiDAR Applications","score":0.934,"subfield":{"id":"https://openalex.org/subfields/2305","display_name":"Environmental Engineering"},"field":{"id":"https://openalex.org/fields/23","display_name":"Environmental Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10191","display_name":"Robotics and Sensor-Based Localization","score":0.9264,"subfield":{"id":"https://openalex.org/subfields/2202","display_name":"Aerospace Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/root","display_name":"Root (linguistics)","score":0.75067985}],"concepts":[{"id":"https://openalex.org/C171078966","wikidata":"https://www.wikidata.org/wiki/Q111029","display_name":"Root (linguistics)","level":2,"score":0.75067985},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.41868436},{"id":"https://openalex.org/C39432304","wikidata":"https://www.wikidata.org/wiki/Q188847","display_name":"Environmental science","level":0,"score":0.34397954},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.1091423},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0788697}],"mesh":[],"locations_count":1,"locations":[{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2411.15714","pdf_url":"http://arxiv.org/pdf/2411.15714","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false}],"best_oa_location":{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2411.15714","pdf_url":"http://arxiv.org/pdf/2411.15714","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4396701345","https://openalex.org/W4396696052","https://openalex.org/W4391913857","https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W2376932109","https://openalex.org/W2358668433","https://openalex.org/W2001405890"],"abstract_inverted_index":{"Recently,":[0],"Vision":[1],"Language":[2],"Models":[3],"(VLMs)":[4],"have":[5],"experienced":[6],"significant":[7],"advancements,":[8],"yet":[9],"these":[10],"models":[11,64],"still":[12],"face":[13],"challenges":[14],"in":[15,176],"spatial":[16,92,112],"hierarchical":[17,93],"reasoning":[18],"within":[19,54,102],"indoor":[20,37,55,103,115,129,149,170],"scenes.":[21,38,56,116],"In":[22],"this":[23,78,153],"study,":[24],"we":[25,40,81,121,156],"introduce":[26],"ROOT,":[27],"a":[28,83,133,139],"VLM-based":[29],"system":[30],"designed":[31],"to":[32,50,65,142],"enhance":[33],"the":[34,70,111],"analysis":[35],"of":[36,90,110,114],"Specifically,":[39],"first":[41],"develop":[42],"an":[43],"iterative":[44],"object":[45,52],"perception":[46],"algorithm":[47],"using":[48],"GPT-4V":[49],"detect":[51],"entities":[53],"This":[57,105],"is":[58,88],"followed":[59],"by":[60],"employing":[61],"vision":[62],"foundation":[63],"acquire":[66],"additional":[67],"meta-information":[68],"about":[69],"scene,":[71],"such":[72,180],"as":[73,181],"bounding":[74],"boxes.":[75],"Building":[76],"on":[77],"foundational":[79],"data,":[80,155],"propose":[82],"specialized":[84],"VLM,":[85],"SceneVLM,":[86,120],"which":[87],"capable":[89],"generating":[91],"scene":[94,134,171,183],"graphs":[95],"and":[96,131,145,161,173,185],"providing":[97],"distance":[98],"information":[99,106],"for":[100],"objects":[101],"environments.":[104],"enhances":[107],"our":[108,119],"understanding":[109,172],"arrangement":[113],"To":[117],"train":[118],"collect":[122],"over":[123],"610,000":[124],"images":[125],"from":[126],"various":[127,158],"public":[128],"datasets":[130],"implement":[132],"data":[135],"generation":[136,184],"pipeline":[137],"with":[138],"semi-automated":[140],"technique":[141],"establish":[143],"relationships":[144],"estimate":[146],"distances":[147],"among":[148],"objects.":[150],"By":[151],"utilizing":[152],"enriched":[154],"conduct":[157],"training":[159],"recipes":[160],"finish":[162],"SceneVLM.":[163],"Our":[164],"experiments":[165],"demonstrate":[166],"that":[167],"\\rootname":[168],"facilitates":[169],"proves":[174],"effective":[175],"diverse":[177],"downstream":[178],"applications,":[179],"3D":[182],"embodied":[186],"AI.":[187],"The":[188],"code":[189],"will":[190],"be":[191],"released":[192],"at":[193],"\\url{https://github.com/harrytea/ROOT}.":[194]},"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4404986724","counts_by_year":[],"updated_date":"2025-01-21T08:55:09.126996","created_date":"2024-12-04"}