{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T02:01:42Z","timestamp":1740103302942,"version":"3.37.3"},"publisher-location":"New York, NY, USA","reference-count":29,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,21]]},"DOI":"10.1145\/3627673.3680011","type":"proceedings-article","created":{"date-parts":[[2024,10,20]],"date-time":"2024-10-20T19:34:11Z","timestamp":1729452851000},"page":"4382-4389","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Personalized Video Summarization by Multimodal Video Understanding"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9025-0107","authenticated-orcid":false,"given":"Brian","family":"Chen","sequence":"first","affiliation":[{"name":"VDIL, Samsung Research America, Irvine, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-8220-6347","authenticated-orcid":false,"given":"Xiangyuan","family":"Zhao","sequence":"additional","affiliation":[{"name":"VDIL, Samsung Research America, Irvine, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-5709-1335","authenticated-orcid":false,"given":"Yingnan","family":"Zhu","sequence":"additional","affiliation":[{"name":"VDIL, Samsung Research America, irvine, CA, USA"}]}],"member":"320","published-online":{"date-parts":[[2024,10,21]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2020.3037883"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3512527.3531404"},{"key":"e_1_3_2_1_3_1","volume-title":"Inter-annotator agreement. Handbook of linguistic annotation","author":"Artstein Ron","year":"2017","unstructured":"Ron Artstein. 2017. Inter-annotator agreement. Handbook of linguistic annotation (2017), 297--313."},{"key":"e_1_3_2_1_4_1","volume-title":"Proceedings of the Asian Conference on Computer Vision.","author":"Bain Max","year":"2020","unstructured":"Max Bain, Arsha Nagrani, Andrew Brown, and Andrew Zisserman. 2020. Condensed movies: Story based retrieval with contextual embeddings. In Proceedings of the Asian Conference on Computer Vision."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00212"},{"key":"e_1_3_2_1_6_1","volume-title":"Moviescope: Large-scale Analysis of Movies using Multiple Modalities. ArXiv","author":"Cascante-Bonilla Paola","year":"2019","unstructured":"Paola Cascante-Bonilla, Kalpathy Sitaraman, Mengjia Luo, and Vicente Ordonez. 2019. Moviescope: Large-scale Analysis of Movies using Multiple Modalities. ArXiv, Vol. abs\/1908.03180 (2019)."},{"volume-title":"Creating summaries from user videos","author":"Gygli Michael","key":"e_1_3_2_1_7_1","unstructured":"Michael Gygli, Helmut Grabner, Hayko Riemenschneider, and Luc Van Gool. 2014. Creating summaries from user videos. In ECCV. Springer, 505--520."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3372278.3390695"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"crossref","unstructured":"Hao Jiang and Yadong Mu. 2022. Joint Video Summarization and Moment Localization by Cross-Task Sample Transfer. In CVPR. 16388--16398.","DOI":"10.1109\/CVPR52688.2022.01590"},{"key":"e_1_3_2_1_10_1","first-page":"11846","article-title":"Detecting moments and highlights in videos via natural language queries","volume":"34","author":"Lei Jie","year":"2021","unstructured":"Jie Lei, Tamara L Berg, and Mohit Bansal. 2021. Detecting moments and highlights in videos via natural language queries. Advances in Neural Information Processing Systems, Vol. 34 (2021), 11846--11858.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_11_1","volume-title":"International Conference on Machine Learning. PMLR, 12888--12900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International Conference on Machine Learning. PMLR, 12888--12900."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00262"},{"key":"e_1_3_2_1_13_1","volume-title":"Proceedings, Part V 13","author":"Lin Tsung-Yi","year":"2014","unstructured":"Tsung-Yi Lin, Michael Maire, Serge Belongie, James Hays, Pietro Perona, Deva Ramanan, Piotr Doll\u00e1r, and C Lawrence Zitnick. 2014. Microsoft coco: Common objects in context. In Computer Vision--ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6--12, 2014, Proceedings, Part V 13. Springer, 740--755."},{"key":"e_1_3_2_1_14_1","unstructured":"J\u00e9r\u00f4me Louradour. 2023. whisper-timestamped. https:\/\/github.com\/linto-ai\/whisper-timestamped."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2022.07.028"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"crossref","unstructured":"Behrooz Mahasseni Michael Lam and Sinisa Todorovic. 2017. Unsupervised video summarization with adversarial lstm networks. In CVPR. 202--211.","DOI":"10.1109\/CVPR.2017.318"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02205"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"crossref","unstructured":"Saiteja Nalla Mohit Agrawal Vishal Kaushal Ganesh Ramakrishnan and Rishabh Iyer. 2020. Watch Hours in Minutes: Summarizing Videos with User Intent. In ECCV.","DOI":"10.1007\/978-3-030-68238-5_47"},{"key":"e_1_3_2_1_19_1","unstructured":"Medhini Narasimhan Anna Rohrbach and Trevor Darrell. 2021. CLIP-It! language-guided video summarization. NeurIPS 13988--14000."},{"volume-title":"The adaptive web: methods and strategies of web personalization","author":"Pazzani Michael J","key":"e_1_3_2_1_20_1","unstructured":"Michael J Pazzani and Daniel Billsus. 2007. Content-based recommendation systems. In The adaptive web: methods and strategies of web personalization. Springer, 325--341."},{"key":"e_1_3_2_1_21_1","volume-title":"Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. CoRR, Vol. abs\/2103.00020 (2021). showeprint[arXiv]2103.00020 https:\/\/arxiv.org\/abs\/2103.00020"},{"key":"e_1_3_2_1_22_1","first-page":"1","article-title":"Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer","volume":"21","author":"Raffel Colin","year":"2020","unstructured":"Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, and Peter J. Liu. 2020. Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer. Journal of Machine Learning Research, Vol. 21, 140 (2020), 1--67. http:\/\/jmlr.org\/papers\/v21\/20-074.html","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"crossref","unstructured":"Aidean Sharghi Jacob S Laurel and Boqing Gong. 2017. Query-focused video summarization: Dataset evaluation and a memory network based approach. In CVPR.","DOI":"10.1109\/CVPR.2017.229"},{"key":"e_1_3_2_1_24_1","volume-title":"Proceedings of the IEEE conference on computer vision and pattern recognition. 5179--5187","author":"Song Yale","year":"2015","unstructured":"Yale Song, Jordi Vallmitjana, Amanda Stent, and Alejandro Jaimes. 2015. Tvsum: Summarizing web videos using titles. In Proceedings of the IEEE conference on computer vision and pattern recognition. 5179--5187."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"crossref","unstructured":"Arun Balajee Vasudevan Michael Gygli Anna Volokitin and Luc Van Gool. 2017. Query-adaptive video summarization via quality-aware relevance estimation. In ACM MM.","DOI":"10.1145\/3123266.3123297"},{"key":"e_1_3_2_1_26_1","unstructured":"Guande Wu Jianzhe Lin and Claudio T Silva. 2022. IntentVizor: Towards Generic Query Guided Interactive Video Summarization. In CVPR. 10503--10512."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2020.2985868"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6929"},{"key":"e_1_3_2_1_29_1","unstructured":"Andy Zeng Maria Attarian Brian Ichter Krzysztof Choromanski Adrian Wong Stefan Welker Federico Tombari Aveek Purohit Michael Ryoo Vikas Sindhwani et al. 2022. Socratic models: Composing zero-shot multimodal reasoning with language. arXiv preprint arXiv:2204.00598 (2022)."}],"event":{"name":"CIKM '24: The 33rd ACM International Conference on Information and Knowledge Management","sponsor":["SIGIR ACM Special Interest Group on Information Retrieval"],"location":"Boise ID USA","acronym":"CIKM '24"},"container-title":["Proceedings of the 33rd ACM International Conference on Information and Knowledge Management"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3627673.3680011","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,21]],"date-time":"2024-10-21T15:45:29Z","timestamp":1729525529000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3627673.3680011"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,21]]},"references-count":29,"alternative-id":["10.1145\/3627673.3680011","10.1145\/3627673"],"URL":"https:\/\/doi.org\/10.1145\/3627673.3680011","relation":{},"subject":[],"published":{"date-parts":[[2024,10,21]]},"assertion":[{"value":"2024-10-21","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}