{"id":"https://openalex.org/W4306176903","doi":"https://doi.org/10.48550/arxiv.2210.05861","title":"SlotFormer: Unsupervised Visual Dynamics Simulation with Object-Centric Models","display_name":"SlotFormer: Unsupervised Visual Dynamics Simulation with Object-Centric Models","publication_year":2022,"publication_date":"2022-01-01","ids":{"openalex":"https://openalex.org/W4306176903","doi":"https://doi.org/10.48550/arxiv.2210.05861"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2210.05861","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false},"type":"preprint","type_crossref":"posted-content","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/abs/2210.05861","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5076788807","display_name":"Ziyi Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Ziyi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5003458137","display_name":"Nikita Dvornik","orcid":"https://orcid.org/0000-0003-4770-3427"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dvornik, Nikita","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5082808731","display_name":"Klaus Greff","orcid":"https://orcid.org/0000-0001-6982-0937"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Greff, Klaus","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5025691113","display_name":"Thomas Kipf","orcid":"https://orcid.org/0000-0003-4486-409X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kipf, Thomas","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5061193324","display_name":"Animesh Garg","orcid":"https://orcid.org/0000-0003-0482-4296"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Garg, Animesh","raw_affiliation_strings":[],"affiliations":[]}],"institution_assertions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":9,"citation_normalized_percentile":{"value":0.820962,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":88,"max":90},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9998,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9998,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.9964,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.9856,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/dynamics","display_name":"Dynamics","score":0.46672028},{"id":"https://openalex.org/keywords/ground-truth","display_name":"Ground truth","score":0.45022577}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.83488584},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.69407225},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.6384219},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.53482133},{"id":"https://openalex.org/C159877910","wikidata":"https://www.wikidata.org/wiki/Q2202883","display_name":"Autoregressive model","level":2,"score":0.5075947},{"id":"https://openalex.org/C145912823","wikidata":"https://www.wikidata.org/wiki/Q113558","display_name":"Dynamics (music)","level":2,"score":0.46672028},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.45088422},{"id":"https://openalex.org/C146849305","wikidata":"https://www.wikidata.org/wiki/Q370766","display_name":"Ground truth","level":2,"score":0.45022577},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.0},{"id":"https://openalex.org/C24890656","wikidata":"https://www.wikidata.org/wiki/Q82811","display_name":"Acoustics","level":1,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C149782125","wikidata":"https://www.wikidata.org/wiki/Q160039","display_name":"Econometrics","level":1,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2210.05861","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false},{"is_oa":false,"landing_page_url":"https://api.datacite.org/dois/10.48550/arxiv.2210.05861","pdf_url":null,"source":{"id":"https://openalex.org/S4393179698","display_name":"DataCite API","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I4210145204","host_organization_name":"DataCite","host_organization_lineage":["https://openalex.org/I4210145204"],"host_organization_lineage_names":["DataCite"],"type":"metadata"},"license":null,"license_id":null,"version":null}],"best_oa_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2210.05861","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4327525404","https://openalex.org/W4295532600","https://openalex.org/W4287185323","https://openalex.org/W3150905897","https://openalex.org/W2171218219","https://openalex.org/W2150410159","https://openalex.org/W2063823869","https://openalex.org/W2047973478","https://openalex.org/W1972271943","https://openalex.org/W1520183331"],"abstract_inverted_index":{"Understanding":[0],"dynamics":[1,34,98,125],"from":[2,14],"visual":[3,141],"observations":[4],"is":[5,178],"a":[6,28,38,48,58,171],"challenging":[7],"problem":[8,43],"that":[9,159],"requires":[10],"disentangling":[11],"individual":[12],"objects":[13],"the":[15,95,105,151],"scene":[16,29],"and":[17,71,117],"learning":[18],"their":[19,33],"interactions.":[20,93],"While":[21],"recent":[22],"object-centric":[23,55],"models":[24,147],"can":[25,100],"successfully":[26,81],"decompose":[27],"into":[30],"objects,":[31],"modeling":[32],"effectively":[35],"still":[36],"remains":[37],"challenge.":[39],"We":[40],"address":[41],"this":[42,78],"by":[44],"introducing":[45],"SlotFormer":[46,83,144],"--":[47],"Transformer-based":[49],"autoregressive":[50],"model":[51,68,99,173],"operating":[52],"on":[53,88,107,124],"learned":[54],"representations.":[56],"Given":[57],"video":[59,86],"clip,":[60],"our":[61,127],"approach":[62],"reasons":[63],"over":[64],"object":[65,75,92,135],"features":[66],"to":[67,84,103,121,148,168],"spatio-temporal":[69],"relationships":[70],"predicts":[72],"accurate":[73],"future":[74,152],"states.":[76],"In":[77],"paper,":[79],"we":[80,164],"apply":[82],"perform":[85],"prediction":[87],"datasets":[89],"with":[90,180],"complex":[91],"Moreover,":[94],"unsupervised":[96],"SlotFormer's":[97],"be":[101],"used":[102],"improve":[104],"performance":[106],"supervised":[108],"downstream":[109],"tasks,":[110],"such":[111,185],"as":[112,170],"Visual":[113],"Question":[114],"Answering":[115],"(VQA),":[116],"goal-conditioned":[118],"planning.":[119],"Compared":[120],"past":[122],"works":[123],"modeling,":[126],"method":[128],"achieves":[129],"significantly":[130],"better":[131],"long-term":[132],"synthesis":[133],"of":[134],"dynamics,":[136],"while":[137],"retaining":[138],"high":[139],"quality":[140],"generation.":[142],"Besides,":[143],"enables":[145],"VQA":[146],"reason":[149],"about":[150],"without":[153],"object-level":[154],"labels,":[155],"even":[156],"outperforming":[157],"counterparts":[158],"use":[160],"ground-truth":[161],"annotations.":[162],"Finally,":[163],"show":[165],"its":[166],"ability":[167],"serve":[169],"world":[172],"for":[174,184],"model-based":[175],"planning,":[176],"which":[177],"competitive":[179],"methods":[181],"designed":[182],"specifically":[183],"tasks.":[186]},"abstract_inverted_index_v3":null,"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4306176903","counts_by_year":[{"year":2024,"cited_by_count":3},{"year":2023,"cited_by_count":6}],"updated_date":"2025-04-30T11:20:30.429368","created_date":"2022-10-14"}