{"id":"https://openalex.org/W4392427785","doi":"https://doi.org/10.48550/arxiv.2403.00587","title":"Improving Explicit Spatial Relationships in Text-to-Image Generation\n through an Automatically Derived Dataset","display_name":"Improving Explicit Spatial Relationships in Text-to-Image Generation\n through an Automatically Derived Dataset","publication_year":2024,"publication_date":"2024-03-01","ids":{"openalex":"https://openalex.org/W4392427785","doi":"https://doi.org/10.48550/arxiv.2403.00587"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2403.00587","pdf_url":"https://arxiv.org/pdf/2403.00587","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"type":"preprint","type_crossref":"posted-content","indexed_in":["arxiv"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2403.00587","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5057219485","display_name":"Ander Salaberria","orcid":"https://orcid.org/0000-0002-4277-3939"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Salaberria, Ander","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5007160780","display_name":"Gorka Azkune","orcid":"https://orcid.org/0000-0002-2506-7426"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Azkune, Gorka","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5061807922","display_name":"Oier L\u00f3pez de Lacalle","orcid":"https://orcid.org/0000-0003-4969-2055"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"de Lacalle, Oier Lopez","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5053230169","display_name":"Aitor Soroa","orcid":"https://orcid.org/0000-0001-8573-2654"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Soroa, Aitor","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5047151336","display_name":"Eneko Agirre","orcid":"https://orcid.org/0000-0002-0195-4899"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Agirre, Eneko","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5054936589","display_name":"Frank Keller","orcid":"https://orcid.org/0000-0002-8242-4362"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Keller, Frank","raw_affiliation_strings":[],"affiliations":[]}],"institution_assertions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.0,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":0,"max":77},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T10799","display_name":"Data Visualization and Analytics","score":0.9397,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10799","display_name":"Data Visualization and Analytics","score":0.9397,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9352,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12290","display_name":"Human Motion and Animation","score":0.929,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[],"concepts":[{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.66654956},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.58871984},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.45322925},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.36248332},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.35426682},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.3521548},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.32749414}],"mesh":[],"locations_count":1,"locations":[{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2403.00587","pdf_url":"https://arxiv.org/pdf/2403.00587","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false}],"best_oa_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2403.00587","pdf_url":"https://arxiv.org/pdf/2403.00587","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W3116076068","https://openalex.org/W2789220062","https://openalex.org/W2779427294","https://openalex.org/W2775347418","https://openalex.org/W2755342338","https://openalex.org/W2625805835","https://openalex.org/W2563206327","https://openalex.org/W2108687567","https://openalex.org/W2079911747","https://openalex.org/W2069885731"],"abstract_inverted_index":{"Existing":[0],"work":[1],"has":[2],"observed":[3],"that":[4,25,55,115,126,154,177],"current":[5],"text-to-image":[6,122],"systems":[7],"do":[8],"not":[9],"accurately":[10],"reflect":[11],"explicit":[12,29,58],"spatial":[13,30,59],"relations":[14,31],"between":[15],"objects":[16,101],"such":[17],"as":[18,134],"'left":[19],"of'":[20],"or":[21],"'below'.":[22],"We":[23,43,61,124],"hypothesize":[24],"this":[26],"is":[27,111,156,179],"because":[28],"rarely":[32],"appear":[33],"in":[34,102,142,149],"the":[35,63,98,103,112,143,150,165,187],"image":[36],"captions":[37,54,83,107],"used":[38,118],"to":[39,88,119,138,158,160],"train":[40,104],"these":[41],"models.":[42],"propose":[44],"an":[45,94],"automatic":[46],"method":[47],"that,":[48],"given":[49],"existing":[50],"images,":[51],"generates":[52],"synthetic":[53],"contain":[56],"14":[57],"relations.":[60,183],"introduce":[62],"Spatial":[64],"Relation":[65],"for":[66,76,84,181],"Generation":[67],"(SR4G)":[68],"dataset,":[69],"which":[70],"contains":[71],"9.9":[72],"millions":[73],"image-caption":[74],"pairs":[75],"training,":[77],"and":[78,105,170,186],"more":[79],"than":[80],"60":[81],"thousand":[82],"evaluation.":[85],"In":[86],"order":[87],"test":[89,106],"generalization":[90],"we":[91],"also":[92],"provide":[93],"'unseen'":[95,151],"split,":[96,152],"where":[97],"set":[99],"of":[100],"are":[108],"disjoint.":[109],"SR4G":[110],"first":[113],"dataset":[114,185],"can":[116],"be":[117,190],"spatially":[120],"fine-tune":[121],"systems.":[123],"show":[125],"fine-tuning":[127],"two":[128],"different":[129],"Stable":[130],"Diffusion":[131],"models":[132],"(denoted":[133],"SD$_{SR4G}$)":[135],"yields":[136],"up":[137],"9":[139],"points":[140],"improvements":[141],"VISOR":[144],"metric.":[145],"The":[146,184],"improvement":[147,178],"holds":[148],"showing":[153],"SD$_{SR4G}$":[155,163],"able":[157],"generalize":[159],"unseen":[161],"objects.":[162],"improves":[164],"state-of-the-art":[166],"with":[167],"fewer":[168],"parameters,":[169],"avoids":[171],"complex":[172],"architectures.":[173],"Our":[174],"analysis":[175],"shows":[176],"consistent":[180],"all":[182],"code":[188],"will":[189],"publicly":[191],"available.":[192]},"abstract_inverted_index_v3":null,"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4392427785","counts_by_year":[],"updated_date":"2025-04-19T12:38:40.988358","created_date":"2024-03-05"}