{"id":"https://openalex.org/W4386057806","doi":"https://doi.org/10.1109/cvpr52729.2023.02207","title":"Towards Generalisable Video Moment Retrieval: Visual-Dynamic Injection to Image-Text Pre-Training","display_name":"Towards Generalisable Video Moment Retrieval: Visual-Dynamic Injection to Image-Text Pre-Training","publication_year":2023,"publication_date":"2023-06-01","ids":{"openalex":"https://openalex.org/W4386057806","doi":"https://doi.org/10.1109/cvpr52729.2023.02207"},"language":"en","primary_location":{"is_oa":false,"landing_page_url":"https://doi.org/10.1109/cvpr52729.2023.02207","pdf_url":null,"source":{"id":"https://openalex.org/S4363607701","display_name":"2022 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false},"type":"article","type_crossref":"proceedings-article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2303.00040","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5052937695","display_name":"Dezhao Luo","orcid":null},"institutions":[{"id":"https://openalex.org/I166337079","display_name":"Queen Mary University of London","ror":"https://ror.org/026zzn846","country_code":"GB","type":"education","lineage":["https://openalex.org/I124357947","https://openalex.org/I166337079"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Dezhao Luo","raw_affiliation_strings":["Queen Mary University of London"],"affiliations":[{"raw_affiliation_string":"Queen Mary University of London","institution_ids":["https://openalex.org/I166337079"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5088580627","display_name":"Jiabo Huang","orcid":"https://orcid.org/0000-0001-7900-3439"},"institutions":[{"id":"https://openalex.org/I166337079","display_name":"Queen Mary University of London","ror":"https://ror.org/026zzn846","country_code":"GB","type":"education","lineage":["https://openalex.org/I124357947","https://openalex.org/I166337079"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Jiabo Huang","raw_affiliation_strings":["Queen Mary University of London"],"affiliations":[{"raw_affiliation_string":"Queen Mary University of London","institution_ids":["https://openalex.org/I166337079"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5039302902","display_name":"Shaogang Gong","orcid":"https://orcid.org/0000-0001-8156-2299"},"institutions":[{"id":"https://openalex.org/I166337079","display_name":"Queen Mary University of London","ror":"https://ror.org/026zzn846","country_code":"GB","type":"education","lineage":["https://openalex.org/I124357947","https://openalex.org/I166337079"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Shaogang Gong","raw_affiliation_strings":["Queen Mary University of London"],"affiliations":[{"raw_affiliation_string":"Queen Mary University of London","institution_ids":["https://openalex.org/I166337079"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5109787260","display_name":"Hailin Jin","orcid":null},"institutions":[{"id":"https://openalex.org/I1306409833","display_name":"Adobe Systems (United States)","ror":"https://ror.org/059tvcg64","country_code":"US","type":"company","lineage":["https://openalex.org/I1306409833"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Hailin Jin","raw_affiliation_strings":["Adobe Research"],"affiliations":[{"raw_affiliation_string":"Adobe Research","institution_ids":["https://openalex.org/I1306409833"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100355884","display_name":"Yang Liu","orcid":"https://orcid.org/0000-0002-4259-3882"},"institutions":[{"id":"https://openalex.org/I111483173","display_name":"King University","ror":"https://ror.org/01evb6z23","country_code":"US","type":"education","lineage":["https://openalex.org/I111483173"]},{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN","US"],"is_corresponding":false,"raw_author_name":"Yang Liu","raw_affiliation_strings":["WICT, Peking University"],"affiliations":[{"raw_affiliation_string":"WICT, Peking University","institution_ids":["https://openalex.org/I111483173","https://openalex.org/I20231570"]}]}],"institution_assertions":[],"countries_distinct_count":3,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":4.385,"has_fulltext":false,"cited_by_count":16,"citation_normalized_percentile":{"value":0.999956,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":97,"max":98},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.997,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.996,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.47623876},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.4418727}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7795309},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.5816779},{"id":"https://openalex.org/C2777601683","wikidata":"https://www.wikidata.org/wiki/Q6499736","display_name":"Vocabulary","level":2,"score":0.53893536},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5266312},{"id":"https://openalex.org/C171268870","wikidata":"https://www.wikidata.org/wiki/Q1486676","display_name":"GRASP","level":2,"score":0.47908813},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.47623876},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.45489022},{"id":"https://openalex.org/C179254644","wikidata":"https://www.wikidata.org/wiki/Q13222844","display_name":"Moment (physics)","level":2,"score":0.4453399},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.4418727},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.4385694},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.42518798},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.41235682},{"id":"https://openalex.org/C104114177","wikidata":"https://www.wikidata.org/wiki/Q79782","display_name":"Motion (physics)","level":2,"score":0.41118246},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.3695348},{"id":"https://openalex.org/C151730666","wikidata":"https://www.wikidata.org/wiki/Q7205","display_name":"Paleontology","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0},{"id":"https://openalex.org/C13280743","wikidata":"https://www.wikidata.org/wiki/Q131089","display_name":"Geodesy","level":1,"score":0.0},{"id":"https://openalex.org/C74650414","wikidata":"https://www.wikidata.org/wiki/Q11397","display_name":"Classical mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"is_oa":false,"landing_page_url":"https://doi.org/10.1109/cvpr52729.2023.02207","pdf_url":null,"source":{"id":"https://openalex.org/S4363607701","display_name":"2022 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false},{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2303.00040","pdf_url":"https://arxiv.org/pdf/2303.00040","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false}],"best_oa_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2303.00040","pdf_url":"https://arxiv.org/pdf/2303.00040","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[{"score":0.78,"display_name":"Quality education","id":"https://metadata.un.org/sdg/4"}],"grants":[],"datasets":[],"versions":[],"referenced_works_count":60,"referenced_works":["https://openalex.org/W1522734439","https://openalex.org/W1686810756","https://openalex.org/W1927052826","https://openalex.org/W2014826060","https://openalex.org/W2064675550","https://openalex.org/W21006490","https://openalex.org/W2108598243","https://openalex.org/W2154579312","https://openalex.org/W2194775991","https://openalex.org/W2619947201","https://openalex.org/W2890502146","https://openalex.org/W2895583029","https://openalex.org/W2896457183","https://openalex.org/W2908510526","https://openalex.org/W2931886155","https://openalex.org/W2949789546","https://openalex.org/W2952686080","https://openalex.org/W2963095467","https://openalex.org/W2963916161","https://openalex.org/W2964089981","https://openalex.org/W2964232540","https://openalex.org/W2978017171","https://openalex.org/W2981851019","https://openalex.org/W2984008963","https://openalex.org/W2997429269","https://openalex.org/W2998712570","https://openalex.org/W3034743747","https://openalex.org/W3035339529","https://openalex.org/W3035635319","https://openalex.org/W3035640828","https://openalex.org/W3094502228","https://openalex.org/W3126337491","https://openalex.org/W3145807616","https://openalex.org/W3152619510","https://openalex.org/W3156595259","https://openalex.org/W3166396011","https://openalex.org/W3175082063","https://openalex.org/W3175817778","https://openalex.org/W3177487519","https://openalex.org/W3180353325","https://openalex.org/W3184735396","https://openalex.org/W3193772061","https://openalex.org/W3197804339","https://openalex.org/W3199096350","https://openalex.org/W3200114289","https://openalex.org/W3215626407","https://openalex.org/W3216156094","https://openalex.org/W4221154629","https://openalex.org/W4226058394","https://openalex.org/W4230025115","https://openalex.org/W4285192809","https://openalex.org/W4285606530","https://openalex.org/W4304086137","https://openalex.org/W4312245888","https://openalex.org/W4312480274","https://openalex.org/W4312509824","https://openalex.org/W4312558481","https://openalex.org/W4312920106","https://openalex.org/W4313011746","https://openalex.org/W4385245566"],"related_works":["https://openalex.org/W4387804363","https://openalex.org/W2743859443","https://openalex.org/W2477150073","https://openalex.org/W2373350752","https://openalex.org/W2326995835","https://openalex.org/W2163296013","https://openalex.org/W2123347777","https://openalex.org/W2059402478","https://openalex.org/W2019547100","https://openalex.org/W165915117"],"abstract_inverted_index":{"The":[0],"correlation":[1],"between":[2],"the":[3,60,68,84,104,108,143,153,165,202,206],"vision":[4],"and":[5,26,130,137,157,187,189,212],"text":[6,105,167],"is":[7,35,111],"essential":[8],"for":[9,24],"video":[10,69,88,99,135,146],"moment":[11],"retrieval":[12],"(VMR),":[13],"however,":[14],"existing":[15,91],"methods":[16,93],"heavily":[17],"rely":[18],"on":[19,66,96,181,201],"separate":[20],"pre-training":[21,64,118],"feature":[22],"extractors":[23],"visual":[25,128,156],"textual":[27],"understanding.":[28],"Without":[29],"sufficient":[30],"temporal":[31,109],"boundary":[32],"annotations,":[33],"it":[34],"non-trivial":[36],"to":[37,54,77,82,171],"learn":[38],"universal":[39],"video-text":[40,175],"alignments.":[41,176],"In":[42],"this":[43],"work,":[44],"we":[45,71,126],"explore":[46],"multi-modal":[47],"correlations":[48],"derived":[49],"from":[50,134],"large-scale":[51],"image-text":[52,63],"data":[53],"facilitate":[55],"generalisable":[56],"VMR.":[57],"To":[58],"address":[59],"limitations":[61],"of":[62,87,103],"models":[65],"capturing":[67],"changes,":[70],"propose":[72],"a":[73],"generic":[74],"method,":[75],"referred":[76],"as":[78],"Visual-Dynamic":[79],"Injection":[80],"(VDI),":[81],"empower":[83],"model's":[85],"understanding":[86],"moments.":[89],"Whilst":[90],"VMR":[92,183],"are":[94,162],"focusing":[95],"building":[97],"temporalaware":[98],"features,":[100],"being":[101,199],"aware":[102],"descriptions":[106],"about":[107],"changes":[110,147],"also":[112],"critical":[113],"but":[114],"originally":[115],"overlooked":[116],"in":[117,160,164],"by":[119],"matching":[120],"static":[121],"images":[122],"with":[123,142],"sentences.":[124],"Therefore,":[125],"extract":[127],"context":[129],"spatial":[131],"dynamic":[132],"information":[133],"frames":[136],"explicitly":[138],"enforce":[139],"their":[140],"alignments":[141],"phrases":[144],"describing":[145],"(e.g.":[148],"verb).":[149],"By":[150],"doing":[151],"so,":[152],"potentially":[154],"relevant":[155],"motion":[158],"patterns":[159],"videos":[161],"encoded":[163],"corresponding":[166],"embeddings":[168],"(injected)":[169],"so":[170],"enable":[172],"more":[173],"accurate":[174],"We":[177],"conduct":[178],"extensive":[179],"experiments":[180],"two":[182],"benchmark":[184],"datasets":[185],"(Charades-STA":[186],"ActivityNet-Captions)":[188],"achieve":[190],"state-of-the-art":[191],"performances.":[192],"Especially,":[193],"VDI":[194],"yields":[195],"notable":[196],"advantages":[197],"when":[198],"tested":[200],"out-of-distribution":[203],"splits":[204],"where":[205],"testing":[207],"samples":[208],"involve":[209],"novel":[210],"scenes":[211],"vocabulary.":[213]},"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4386057806","counts_by_year":[{"year":2024,"cited_by_count":13},{"year":2023,"cited_by_count":3}],"updated_date":"2025-01-04T15:37:00.603862","created_date":"2023-08-23"}