{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T01:35:48Z","timestamp":1740101748168,"version":"3.37.3"},"publisher-location":"New York, NY, USA","reference-count":66,"publisher":"ACM","funder":[{"name":"Alibaba Innovation Research Program"},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62202170"],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,4,30]]},"DOI":"10.1145\/3543507.3583365","type":"proceedings-article","created":{"date-parts":[[2023,4,26]],"date-time":"2023-04-26T23:30:51Z","timestamp":1682551851000},"page":"3257-3267","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Match4Match: Enhancing Text-Video Retrieval by Maximum Flow with Minimum Cost"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-5973-8240","authenticated-orcid":false,"given":"Zhongjie","family":"Duan","sequence":"first","affiliation":[{"name":"East China Normal University, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1010-9678","authenticated-orcid":false,"given":"Chengyu","family":"Wang","sequence":"additional","affiliation":[{"name":"Alibaba Group, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0325-1705","authenticated-orcid":false,"given":"Cen","family":"Chen","sequence":"additional","affiliation":[{"name":"East China Normal University, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9967-5515","authenticated-orcid":false,"given":"Wenmeng","family":"Zhou","sequence":"additional","affiliation":[{"name":"Alibaba Group, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7706-7081","authenticated-orcid":false,"given":"Jun","family":"Huang","sequence":"additional","affiliation":[{"name":"Alibaba Group, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4132-8630","authenticated-orcid":false,"given":"Weining","family":"Qian","sequence":"additional","affiliation":[{"name":"East China Normal University, China"}]}],"member":"320","published-online":{"date-parts":[[2023,4,30]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Arnon Amir Janne Argillander Murray Campbell Alexander Haubold Giridharan Iyengar Shahram Ebadollahi Feng Kang Milind\u00a0R Naphade Apostol Natsev John\u00a0R Smith 2003. IBM Research TRECVID-2003 Video Retrieval System.. In TRECVID."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i8.16822"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.618"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"e_1_3_2_1_5_1","volume-title":"Airline schedule planning: Accomplishments and opportunities. Manufacturing & service operations management 6, 1","author":"Barnhart Cynthia","year":"2004","unstructured":"Cynthia Barnhart and Amy Cohn. 2004. Airline schedule planning: Accomplishments and opportunities. Manufacturing & service operations management 6, 1 (2004), 3\u201322."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1287\/trsc.34.3.239.12300"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00513"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1080\/10556789808805709"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.5555\/2002472.2002497"},{"key":"e_1_3_2_1_11_1","volume-title":"Maximum flow and minimum-cost flow in almost-linear time. arXiv preprint arXiv:2203.00671","author":"Chen Li","year":"2022","unstructured":"Li Chen, Rasmus Kyng, Yang\u00a0P Liu, Richard Peng, Maximilian\u00a0Probst Gutenberg, and Sushant Sachdeva. 2022. Maximum flow and minimum-cost flow in almost-linear time. arXiv preprint arXiv:2203.00671 (2022)."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01065"},{"key":"e_1_3_2_1_13_1","volume-title":"International conference on machine learning. PMLR, 1597\u20131607","author":"Chen Ting","year":"2020","unstructured":"Ting Chen, Simon Kornblith, Mohammad Norouzi, and Geoffrey Hinton. 2020. A simple framework for contrastive learning of visual representations. In International conference on machine learning. PMLR, 1597\u20131607."},{"key":"e_1_3_2_1_14_1","volume-title":"Improving video-text retrieval by multi-stream corpus alignment and dual softmax loss. arXiv preprint arXiv:2109.04290","author":"Cheng Xing","year":"2021","unstructured":"Xing Cheng, Hezheng Lin, Xiangyu Wu, Fan Yang, and Dong Shen. 2021. Improving video-text retrieval by multi-stream corpus alignment and dual softmax loss. arXiv preprint arXiv:2109.04290 (2021)."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/1993636.1993674"},{"key":"e_1_3_2_1_16_1","volume-title":"Application of the simplex method to a transportation problem. Activity analysis and production and allocation","author":"Dantzig B","year":"1951","unstructured":"George\u00a0B Dantzig. 1951. Application of the simplex method to a transportation problem. Activity analysis and production and allocation (1951)."},{"volume-title":"Doklady Akademii nauk, Vol.\u00a0194","author":"Dinitz A","key":"e_1_3_2_1_17_1","unstructured":"Yefim\u00a0A Dinitz. 1970. An algorithm for the solution of the problem of maximal flow in a network with power estimation. In Doklady Akademii nauk, Vol.\u00a0194. Russian Academy of Sciences, 754\u2013757."},{"key":"e_1_3_2_1_18_1","volume-title":"An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929","author":"Dosovitskiy Alexey","year":"2020","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/321694.321699"},{"key":"e_1_3_2_1_20_1","volume-title":"Clip2video: Mastering video-text retrieval via image clip. arXiv preprint arXiv:2106.11097","author":"Fang Han","year":"2021","unstructured":"Han Fang, Pengfei Xiong, Luhui Xu, and Yu Chen. 2021. Clip2video: Mastering video-text retrieval via image clip. arXiv preprint arXiv:2106.11097 (2021)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.4153\/CJM-1956-045-5"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58548-8_13"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3368089.3417050"},{"key":"e_1_3_2_1_24_1","volume-title":"Clip2tv: An empirical study on transformer-based methods for video-text retrieval. arXiv preprint arXiv:2111.05610","author":"Gao Zijian","year":"2021","unstructured":"Zijian Gao, Jingyu Liu, Sheng Chen, Dedan Chang, Hao Zhang, and Jinwei Yuan. 2021. Clip2tv: An empirical study on transformer-based methods for video-text retrieval. arXiv preprint arXiv:2111.05610 (2021)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1006\/jagm.1995.0805"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1287\/moor.15.3.430"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475241"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00745"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/TSMCC.2011.2109710"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1007\/3-540-52921-7_52"},{"key":"e_1_3_2_1_32_1","volume-title":"Product quantization for nearest neighbor search","author":"Jegou Herve","year":"2010","unstructured":"Herve Jegou, Matthijs Douze, and Cordelia Schmid. 2010. Product quantization for nearest neighbor search. IEEE transactions on pattern analysis and machine intelligence 33, 1 (2010), 117\u2013128."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/TBDATA.2019.2921572"},{"key":"e_1_3_2_1_34_1","volume-title":"Proceedings of naacL-HLT. 4171\u20134186","author":"Ming-Wei\u00a0Chang Jacob Devlin","year":"2019","unstructured":"Jacob Devlin Ming-Wei\u00a0Chang Kenton and Lee\u00a0Kristina Toutanova. 2019. Bert: Pre-training of deep bidirectional transformers for language understanding. In Proceedings of naacL-HLT. 4171\u20134186."},{"key":"e_1_3_2_1_35_1","volume-title":"Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980","author":"Kingma P","year":"2014","unstructured":"Diederik\u00a0P Kingma and Jimmy Ba. 2014. Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.83"},{"key":"e_1_3_2_1_37_1","volume-title":"The Hungarian method for the assignment problem. Naval research logistics quarterly 2, 1-2","author":"Kuhn W","year":"1955","unstructured":"Harold\u00a0W Kuhn. 1955. The Hungarian method for the assignment problem. Naval research logistics quarterly 2, 1-2 (1955), 83\u201397."},{"key":"e_1_3_2_1_38_1","volume-title":"Fine-tuning can distort pretrained features and underperform out-of-distribution. arXiv preprint arXiv:2202.10054","author":"Kumar Ananya","year":"2022","unstructured":"Ananya Kumar, Aditi Raghunathan, Robbie Jones, Tengyu Ma, and Percy Liang. 2022. Fine-tuning can distort pretrained features and underperform out-of-distribution. arXiv preprint arXiv:2202.10054 (2022)."},{"key":"e_1_3_2_1_39_1","volume-title":"One More Step Towards Generalization. arXiv preprint arXiv:2203.07086","author":"Kunitsyn Alexander","year":"2022","unstructured":"Alexander Kunitsyn, Maksim Kalashnikov, Maksim Dzabraev, and Andrei Ivaniuta. 2022. MDMMT-2: Multidomain Multimodal Transformer for Video Retrieval, One More Step Towards Generalization. arXiv preprint arXiv:2203.07086 (2022)."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00725"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00490"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01170"},{"key":"e_1_3_2_1_43_1","volume-title":"Use what you have: Video retrieval using representations from collaborative experts. arXiv preprint arXiv:1907.13487","author":"Liu Yang","year":"2019","unstructured":"Yang Liu, Samuel Albanie, Arsha Nagrani, and Andrew Zisserman. 2019. Use what you have: Video retrieval using representations from collaborative experts. arXiv preprint arXiv:1907.13487 (2019)."},{"key":"e_1_3_2_1_44_1","volume-title":"CLIP4Clip: An Empirical Study of CLIP for End to End Video Clip Retrieval and Captioning. Neurocomputing","author":"Luo Huaishao","year":"2022","unstructured":"Huaishao Luo, Lei Ji, Ming Zhong, Yang Chen, Wen Lei, Nan Duan, and Tianrui Li. 2022. CLIP4Clip: An Empirical Study of CLIP for End to End Video Clip Retrieval and Captioning. Neurocomputing (2022)."},{"key":"e_1_3_2_1_45_1","volume-title":"Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748","author":"van\u00a0den Oord Aaron","year":"2018","unstructured":"Aaron van\u00a0den Oord, Yazhe Li, and Oriol Vinyals. 2018. Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 (2018)."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46604-0_46"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.497"},{"key":"e_1_3_2_1_48_1","volume-title":"Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems 32","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, 2019. Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems 32 (2019)."},{"key":"e_1_3_2_1_49_1","unstructured":"Laurent Perron and Vincent Furnon. 2022. OR-Tools. Google. https:\/\/developers.google.com\/optimization\/"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-77004-4_1"},{"key":"e_1_3_2_1_51_1","volume-title":"International Conference on Machine Learning. PMLR, 8748\u20138763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, 2021. Learning transferable visual models from natural language supervision. In International Conference on Machine Learning. PMLR, 8748\u20138763."},{"key":"e_1_3_2_1_52_1","volume-title":"Hubs in space: Popular nearest neighbors in high-dimensional data. Journal of Machine Learning Research 11, sept","author":"Radovanovic Milos","year":"2010","unstructured":"Milos Radovanovic, Alexandros Nanopoulos, and Mirjana Ivanovic. 2010. Hubs in space: Popular nearest neighbors in high-dimensional data. Journal of Machine Learning Research 11, sept (2010), 2487\u20132531."},{"key":"e_1_3_2_1_53_1","volume-title":"A stochastic approximation method. The annals of mathematical statistics","author":"Robbins Herbert","year":"1951","unstructured":"Herbert Robbins and Sutton Monro. 1951. A stochastic approximation method. The annals of mathematical statistics (1951), 400\u2013407."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-24947-6_17"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.15807\/jorsj.47.244"},{"key":"e_1_3_2_1_56_1","volume-title":"IEEE International Conference on, Vol.\u00a03. IEEE Computer Society, 1470\u20131470","author":"Sivic Josef","year":"2003","unstructured":"Josef Sivic and Andrew Zisserman. 2003. Video Google: A text retrieval approach to object matching in videos. In Computer Vision, IEEE International Conference on, Vol.\u00a03. IEEE Computer Society, 1470\u20131470."},{"key":"e_1_3_2_1_57_1","volume-title":"Concept-based video retrieval. Foundations and Trends\u00ae in Information Retrieval 2, 4","author":"Snoek GM","year":"2009","unstructured":"Cees\u00a0GM Snoek, Marcel Worring, 2009. Concept-based video retrieval. Foundations and Trends\u00ae in Information Retrieval 2, 4 (2009), 215\u2013322."},{"key":"e_1_3_2_1_58_1","volume-title":"Attention is all you need. Advances in neural information processing systems 30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan\u00a0N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","unstructured":"Chengyu Wang Minghui Qiu Taolin Zhang Tingting Liu Lei Li Jianing Wang Ming Wang Jun Huang and Wei Lin. 2022. EasyNLP: A Comprehensive and Easy-to-use Toolkit for Natural Language Processing. (2022). https:\/\/doi.org\/10.48550\/ARXIV.2205.00258","DOI":"10.48550\/ARXIV.2205.00258"},{"key":"e_1_3_2_1_60_1","volume-title":"J. International Conference on Learning Representations (ICLR","author":"Welling Max","year":"2016","unstructured":"Max Welling and Thomas\u00a0N Kipf. 2016. Semi-supervised classification with graph convolutional networks. In J. International Conference on Learning Representations (ICLR 2017)."},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00393"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.634"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.571"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01136"},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01234-2_29"},{"key":"e_1_3_2_1_66_1","volume-title":"CenterCLIP: Token Clustering for Efficient Text-Video Retrieval. arXiv preprint arXiv:2205.00823","author":"Zhao Shuai","year":"2022","unstructured":"Shuai Zhao, Linchao Zhu, Xiaohan Wang, and Yi Yang. 2022. CenterCLIP: Token Clustering for Efficient Text-Video Retrieval. arXiv preprint arXiv:2205.00823 (2022)."}],"event":{"name":"WWW '23: The ACM Web Conference 2023","sponsor":["SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"],"location":"Austin TX USA","acronym":"WWW '23"},"container-title":["Proceedings of the ACM Web Conference 2023"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3543507.3583365","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,2,29]],"date-time":"2024-02-29T19:48:36Z","timestamp":1709236116000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3543507.3583365"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,4,30]]},"references-count":66,"alternative-id":["10.1145\/3543507.3583365","10.1145\/3543507"],"URL":"https:\/\/doi.org\/10.1145\/3543507.3583365","relation":{},"subject":[],"published":{"date-parts":[[2023,4,30]]},"assertion":[{"value":"2023-04-30","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}