{"id":"https://openalex.org/W4400104339","doi":"https://doi.org/10.48550/arxiv.2406.17880","title":"MLLM as Video Narrator: Mitigating Modality Imbalance in Video Moment\n Retrieval","display_name":"MLLM as Video Narrator: Mitigating Modality Imbalance in Video Moment\n Retrieval","publication_year":2024,"publication_date":"2024-06-25","ids":{"openalex":"https://openalex.org/W4400104339","doi":"https://doi.org/10.48550/arxiv.2406.17880"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2406.17880","pdf_url":"http://arxiv.org/pdf/2406.17880","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"type":"preprint","type_crossref":"posted-content","indexed_in":["arxiv"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"http://arxiv.org/pdf/2406.17880","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5102651416","display_name":"Weitong Cai","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cai, Weitong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5088580627","display_name":"Jiabo Huang","orcid":"https://orcid.org/0000-0001-7900-3439"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Jiabo","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5039302902","display_name":"Shaogang Gong","orcid":"https://orcid.org/0000-0001-8156-2299"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gong, Shaogang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5109787260","display_name":"Hailin Jin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jin, Hailin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5088963844","display_name":"Yang Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Yang","raw_affiliation_strings":[],"affiliations":[]}],"institution_assertions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.0,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":0,"max":77},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9994,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9994,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9988,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9968,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/modality","display_name":"Modality (human\u2013computer interaction)","score":0.78380233}],"concepts":[{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.78380233},{"id":"https://openalex.org/C179254644","wikidata":"https://www.wikidata.org/wiki/Q13222844","display_name":"Moment (physics)","level":2,"score":0.6376006},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.48109716},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.2448737},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.082753},{"id":"https://openalex.org/C74650414","wikidata":"https://www.wikidata.org/wiki/Q11397","display_name":"Classical mechanics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2406.17880","pdf_url":"http://arxiv.org/pdf/2406.17880","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false}],"best_oa_location":{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2406.17880","pdf_url":"http://arxiv.org/pdf/2406.17880","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4396701345","https://openalex.org/W4396696052","https://openalex.org/W4391375266","https://openalex.org/W2748952813","https://openalex.org/W2530972254","https://openalex.org/W2390279801","https://openalex.org/W2385859805","https://openalex.org/W2376932109","https://openalex.org/W2358668433","https://openalex.org/W2001405890"],"abstract_inverted_index":{"Video":[0],"Moment":[1],"Retrieval":[2],"(VMR)":[3],"aims":[4],"to":[5,80,110,137,178,199],"localize":[6],"a":[7,17,35,54,74,107,148,190],"specific":[8],"temporal":[9,126,131,176],"segment":[10],"within":[11,70],"an":[12,104],"untrimmed":[13],"long":[14],"video":[15,40,108,144,175,181],"given":[16],"natural":[18],"language":[19,96],"query.":[20],"Existing":[21],"methods":[22],"often":[23],"suffer":[24],"from":[25,203],"inadequate":[26],"training":[27],"annotations,":[28],"i.e.,":[29],"the":[30,38,43,66,71,89,116,120,125,160,170,197,216],"sentence":[31],"typically":[32],"matches":[33],"with":[34,45,62,152,159],"fraction":[36],"of":[37,57,73,93,115,220],"prominent":[39],"content":[41],"in":[42,99],"foreground":[44],"limited":[46,75],"wording":[47],"diversity.":[48],"This":[49],"intrinsic":[50],"modality":[51,121],"imbalance":[52,122],"leaves":[53],"considerable":[55],"portion":[56],"visual":[58,161],"information":[59,202],"remaining":[60],"unaligned":[61],"text.":[63],"It":[64],"confines":[65],"cross-modal":[67,166],"alignment":[68],"knowledge":[69],"scope":[72],"text":[76,139,150],"corpus,":[77],"thereby":[78,118],"leading":[79],"sub-optimal":[81],"visual-textual":[82,90],"modeling":[83],"and":[84,123,146,173,218],"poor":[85],"generalizability.":[86],"By":[87],"leveraging":[88],"understanding":[91],"capability":[92],"multi-modal":[94],"large":[95],"models":[97],"(MLLM),":[98],"this":[100],"work,":[101],"we":[102,135,164,188],"take":[103],"MLLM":[105],"as":[106],"narrator":[109],"generate":[111],"plausible":[112],"textual":[113],"descriptions":[114,206],"video,":[117],"mitigating":[119],"boosting":[124],"localization.":[127,186],"To":[128],"effectively":[129],"maintain":[130],"sensibility":[132],"for":[133,141,184,207],"localization,":[134],"design":[136],"get":[138],"narratives":[140,172],"each":[142],"certain":[143],"timestamp":[145],"construct":[147],"structured":[149],"paragraph":[151],"time":[153],"information,":[154],"which":[155,195],"is":[156],"temporally":[157],"aligned":[158],"content.":[162],"Then":[163],"perform":[165],"feature":[167],"merging":[168],"between":[169],"temporal-aware":[171],"corresponding":[174],"features":[177],"produce":[179],"semantic-enhanced":[180],"representation":[182],"sequences":[183],"query":[185],"Subsequently,":[187],"introduce":[189],"uni-modal":[191],"narrative-query":[192],"matching":[193],"mechanism,":[194],"encourages":[196],"model":[198],"extract":[200],"complementary":[201],"contextual":[204],"cohesive":[205],"improved":[208],"retrieval.":[209],"Extensive":[210],"experiments":[211],"on":[212],"two":[213],"benchmarks":[214],"show":[215],"effectiveness":[217],"generalizability":[219],"our":[221],"proposed":[222],"method.":[223]},"abstract_inverted_index_v3":null,"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4400104339","counts_by_year":[],"updated_date":"2025-04-23T02:21:22.998612","created_date":"2024-06-28"}