{"id":"https://openalex.org/W3019301826","doi":"https://doi.org/10.1109/tip.2020.2988435","title":"Video Captioning With Object-Aware Spatio-Temporal Correlation and Aggregation","display_name":"Video Captioning With Object-Aware Spatio-Temporal Correlation and Aggregation","publication_year":2020,"publication_date":"2020-01-01","ids":{"openalex":"https://openalex.org/W3019301826","doi":"https://doi.org/10.1109/tip.2020.2988435","mag":"3019301826","pmid":"https://pubmed.ncbi.nlm.nih.gov/32356746"},"language":"en","primary_location":{"is_oa":false,"landing_page_url":"https://doi.org/10.1109/tip.2020.2988435","pdf_url":null,"source":{"id":"https://openalex.org/S4210173141","display_name":"IEEE Transactions on Image Processing","issn_l":"1057-7149","issn":["1057-7149","1941-0042"],"is_oa":false,"is_in_doaj":false,"is_indexed_in_scopus":true,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false},"type":"article","type_crossref":"journal-article","indexed_in":["crossref","pubmed"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5087793912","display_name":"Junchao Zhang","orcid":"https://orcid.org/0000-0003-2243-0012"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"funder","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Junchao Zhang","raw_affiliation_strings":["Wangxuan Institute of Computer Technology, Peking University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Wangxuan Institute of Computer Technology, Peking University, Beijing, China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5047811387","display_name":"Yuxin Peng","orcid":"https://orcid.org/0000-0001-7658-3845"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"funder","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yuxin Peng","raw_affiliation_strings":["Wangxuan Institute of Computer Technology, Peking University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Wangxuan Institute of Computer Technology, Peking University, Beijing, China","institution_ids":["https://openalex.org/I20231570"]}]}],"institution_assertions":[],"countries_distinct_count":1,"institutions_distinct_count":1,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":4.303,"has_fulltext":false,"cited_by_count":52,"citation_normalized_percentile":{"value":0.999944,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":97,"max":98},"biblio":{"volume":"29","issue":null,"first_page":"6209","last_page":"6222"},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.9985,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9969,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/closed-captioning","display_name":"Closed captioning","score":0.81776357},{"id":"https://openalex.org/keywords/discriminative-model","display_name":"Discriminative model","score":0.59907037},{"id":"https://openalex.org/keywords/spatial-relation","display_name":"Spatial relation","score":0.53205},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.52738994},{"id":"https://openalex.org/keywords/encode","display_name":"ENCODE","score":0.4847488}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.82034886},{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.81776357},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6755815},{"id":"https://openalex.org/C2780719617","wikidata":"https://www.wikidata.org/wiki/Q1030752","display_name":"Salient","level":2,"score":0.6273938},{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.59907037},{"id":"https://openalex.org/C27511587","wikidata":"https://www.wikidata.org/wiki/Q2178623","display_name":"Spatial relation","level":2,"score":0.53205},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.52738994},{"id":"https://openalex.org/C132525143","wikidata":"https://www.wikidata.org/wiki/Q141488","display_name":"Graph","level":2,"score":0.5201069},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.5042583},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.4938789},{"id":"https://openalex.org/C66746571","wikidata":"https://www.wikidata.org/wiki/Q1134833","display_name":"ENCODE","level":3,"score":0.4847488},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.48025778},{"id":"https://openalex.org/C25343380","wikidata":"https://www.wikidata.org/wiki/Q277521","display_name":"Relation (database)","level":2,"score":0.4136416},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.39235616},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.15674204},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.12487325},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.09250978},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0},{"id":"https://openalex.org/C55493867","wikidata":"https://www.wikidata.org/wiki/Q7094","display_name":"Biochemistry","level":1,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C104317684","wikidata":"https://www.wikidata.org/wiki/Q7187","display_name":"Gene","level":2,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"is_oa":false,"landing_page_url":"https://doi.org/10.1109/tip.2020.2988435","pdf_url":null,"source":{"id":"https://openalex.org/S4210173141","display_name":"IEEE Transactions on Image Processing","issn_l":"1057-7149","issn":["1057-7149","1941-0042"],"is_oa":false,"is_in_doaj":false,"is_indexed_in_scopus":true,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false},{"is_oa":false,"landing_page_url":"https://pubmed.ncbi.nlm.nih.gov/32356746","pdf_url":null,"source":{"id":"https://openalex.org/S4306525036","display_name":"PubMed","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":["National Institutes of Health"],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/10","score":0.72,"display_name":"Reduced inequalities"}],"grants":[{"funder":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China","award_id":"61925201"},{"funder":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China","award_id":"61771025"}],"datasets":[],"versions":[],"referenced_works_count":80,"referenced_works":["https://openalex.org/W1522734439","https://openalex.org/W1586939924","https://openalex.org/W1861492603","https://openalex.org/W1889081078","https://openalex.org/W1924770834","https://openalex.org/W1950136256","https://openalex.org/W1956340063","https://openalex.org/W1984309565","https://openalex.org/W2012592962","https://openalex.org/W2097117768","https://openalex.org/W2101105183","https://openalex.org/W2110933980","https://openalex.org/W2133512280","https://openalex.org/W2133564696","https://openalex.org/W2139501017","https://openalex.org/W2142521298","https://openalex.org/W2142535891","https://openalex.org/W2142900973","https://openalex.org/W2152984213","https://openalex.org/W2156718197","https://openalex.org/W2164290393","https://openalex.org/W2194775991","https://openalex.org/W2425121537","https://openalex.org/W2526050071","https://openalex.org/W2554906389","https://openalex.org/W2559542432","https://openalex.org/W2584992898","https://openalex.org/W2586853845","https://openalex.org/W2588822708","https://openalex.org/W2620629206","https://openalex.org/W2621571501","https://openalex.org/W2737030113","https://openalex.org/W2739107216","https://openalex.org/W2751808960","https://openalex.org/W2752191396","https://openalex.org/W2765658575","https://openalex.org/W2766375149","https://openalex.org/W2766520430","https://openalex.org/W2768661419","https://openalex.org/W2770191827","https://openalex.org/W2783838156","https://openalex.org/W2791813490","https://openalex.org/W2791900743","https://openalex.org/W2798725893","https://openalex.org/W2806331055","https://openalex.org/W2808203533","https://openalex.org/W2808647806","https://openalex.org/W2808675313","https://openalex.org/W2887272576","https://openalex.org/W2887712318","https://openalex.org/W2894280539","https://openalex.org/W2896878184","https://openalex.org/W2905172366","https://openalex.org/W2932399282","https://openalex.org/W2940457086","https://openalex.org/W2943294985","https://openalex.org/W2945223572","https://openalex.org/W2948358897","https://openalex.org/W2951702519","https://openalex.org/W2954137266","https://openalex.org/W2962681491","https://openalex.org/W2962756421","https://openalex.org/W2962767366","https://openalex.org/W2962958773","https://openalex.org/W2962994439","https://openalex.org/W2963150697","https://openalex.org/W2963456618","https://openalex.org/W2963465695","https://openalex.org/W2963576560","https://openalex.org/W2963843052","https://openalex.org/W2964015378","https://openalex.org/W2964199361","https://openalex.org/W2964241990","https://openalex.org/W2964242696","https://openalex.org/W2964308564","https://openalex.org/W2983141445","https://openalex.org/W3103616093","https://openalex.org/W3104097132","https://openalex.org/W4249013746","https://openalex.org/W4294558607"],"related_works":["https://openalex.org/W4290852288","https://openalex.org/W4283207562","https://openalex.org/W4210416330","https://openalex.org/W3088136942","https://openalex.org/W2963177403","https://openalex.org/W2949362007","https://openalex.org/W2775506363","https://openalex.org/W2330246314","https://openalex.org/W2149051193","https://openalex.org/W1545923090"],"abstract_inverted_index":{"Video":[0],"captioning":[1,70],"is":[2,28,66,136,178,195,244],"a":[3],"significant":[4],"challenging":[5],"task":[6],"in":[7,47,82,182,269],"computer":[8],"vision":[9],"and":[10,44,61,78,114,126,139,191,260,274],"natural":[11,21],"language":[12,22],"processing,":[13],"aiming":[14],"to":[15,37,71,100,145,159,197,222,247],"automatically":[16],"describe":[17],"video":[18,27,33,69,94,238],"content":[19,43],"by":[20,185],"sentences.":[23],"Comprehensive":[24],"understanding":[25],"of":[26,55,215,250,271],"the":[29,41,52,73,107,115,142,161],"key":[30],"for":[31,68,93,150,165,205,236],"accurate":[32],"captioning,":[34],"which":[35,155,194,231],"needs":[36],"not":[38],"only":[39],"capture":[40,72,160],"global":[42],"salient":[45,167,206],"objects":[46,102,151,181],"video,":[48],"but":[49],"also":[50,245],"understand":[51],"spatio-temporal":[53,89,98],"relations":[54],"objects,":[56],"including":[57],"their":[58,104,187],"temporal":[59,108,131,134,143,148,163],"trajectories":[60,164],"spatial":[62,116,172,189],"relationships.":[63,122],"Thus,":[64],"it":[65],"important":[67],"objects'":[74,111,119,228],"relationships":[75,204],"both":[76],"within":[77],"across":[79,152],"frames.":[80],"Therefore,":[81],"this":[83],"paper,":[84],"we":[85],"propose":[86],"an":[87],"object-aware":[88,224],"graph":[90,135,177],"(OSTG)":[91],"approach":[92,265],"captioning.":[95,239],"It":[96],"constructs":[97],"graphs":[99,109,117],"depict":[101],"with":[103],"relations,":[105],"where":[106],"represent":[110,118],"inter-frame":[112,162],"dynamics,":[113],"intra-frame":[120,203],"interactive":[121],"The":[123],"main":[124],"novelties":[125],"advantages":[127],"are:":[128],"(1)":[129],"Bidirectional":[130,133],"alignment:":[132],"constructed":[137,179],"along":[138,141],"reversely":[140],"order":[144],"perform":[146,223],"bidirectional":[147],"alignment":[149],"different":[153,251],"frames,":[154],"provides":[156],"complementary":[157],"clues":[158],"each":[166,183],"object.":[168],"(2)":[169],"Graph":[170],"based":[171],"relation":[173,176,199],"learning:":[174],"Spatial":[175],"among":[180],"frame":[184],"considering":[186],"relative":[188],"locations":[190],"semantic":[192],"correlations,":[193],"exploited":[196],"learn":[198,232],"features":[200],"that":[201],"encode":[202],"objects.":[207],"(3)":[208],"Object-aware":[209],"feature":[210,225],"aggregation:":[211],"Trainable":[212],"VLAD":[213],"(vector":[214],"locally":[216],"aggregated":[217,234],"descriptors)":[218],"models":[219],"are":[220],"deployed":[221],"aggregation":[226],"on":[227,255],"local":[229],"features,":[230],"discriminative":[233],"representations":[235],"better":[237],"A":[240],"hierarchical":[241],"attention":[242],"mechanism":[243],"developed":[246],"distinguish":[248],"contributions":[249],"object":[252],"instances.":[253],"Experiments":[254],"two":[256],"widely-used":[257],"datasets,":[258],"MSR-VTT":[259],"MSVD,":[261],"demonstrate":[262],"our":[263],"proposed":[264],"achieves":[266],"state-of-the-art":[267],"performances":[268],"terms":[270],"BLEU@4,":[272],"METEOR":[273],"CIDEr":[275],"metrics.":[276]},"abstract_inverted_index_v3":null,"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W3019301826","counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2024,"cited_by_count":6},{"year":2023,"cited_by_count":10},{"year":2022,"cited_by_count":22},{"year":2021,"cited_by_count":11},{"year":2020,"cited_by_count":2}],"updated_date":"2025-03-30T16:10:50.439103","created_date":"2020-05-01"}