{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,22]],"date-time":"2025-03-22T12:30:32Z","timestamp":1742646632680,"version":"3.37.3"},"publisher-location":"New York, NY, USA","reference-count":85,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,5,11]]},"DOI":"10.1145\/3613904.3642443","type":"proceedings-article","created":{"date-parts":[[2024,5,11]],"date-time":"2024-05-11T08:38:25Z","timestamp":1715416705000},"page":"1-17","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["TutoAI: a cross-domain framework for AI-assisted mixed-media tutorial creation on physical tasks"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8720-5516","authenticated-orcid":false,"given":"Yuexi","family":"Chen","sequence":"first","affiliation":[{"name":"University of Maryland, United States"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7937-7748","authenticated-orcid":false,"given":"Vlad I","family":"Morariu","sequence":"additional","affiliation":[{"name":"Adobe Research, United States"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-5409-7287","authenticated-orcid":false,"given":"Anh","family":"Truong","sequence":"additional","affiliation":[{"name":"Adobe Research, United States"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1015-2759","authenticated-orcid":false,"given":"Zhicheng","family":"Liu","sequence":"additional","affiliation":[{"name":"University of Maryland, United States"}]}],"member":"320","published-online":{"date-parts":[[2024,5,11]]},"reference":[{"unstructured":"Meta AI. 2022. Video Summarization. https:\/\/paperswithcode.com\/task\/video-summarization","key":"e_1_3_3_3_1_1"},{"unstructured":"Meta AI. 2022. Video Summarization. https:\/\/paperswithcode.com\/task\/part-of-speech-tagging","key":"e_1_3_3_3_2_1"},{"key":"e_1_3_3_3_3_1","volume-title":"Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics (demonstrations). 54\u201359","author":"Akbik Alan","year":"2019","unstructured":"Alan Akbik, Tanja Bergmann, Duncan Blythe, Kashif Rasul, Stefan Schweter, and Roland Vollgraf. 2019. FLAIR: An easy-to-use framework for state-of-the-art NLP. In Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics (demonstrations). 54\u201359."},{"doi-asserted-by":"publisher","key":"e_1_3_3_3_4_1","DOI":"10.1145\/3290605.3300233"},{"key":"e_1_3_3_3_5_1","volume-title":"Thinking aloud: Dynamic context generation improves zero-shot reasoning performance of gpt-2. arXiv preprint arXiv:2103.13033","author":"Betz Gregor","year":"2021","unstructured":"Gregor Betz, Kyle Richardson, and Christian Voigt. 2021. Thinking aloud: Dynamic context generation improves zero-shot reasoning performance of gpt-2. arXiv preprint arXiv:2103.13033 (2021)."},{"unstructured":"David Bitan. 2022. How to Repair a Leaking Roof. https:\/\/www.wikihow.com\/Repair-a-Leaking-Roof","key":"e_1_3_3_3_6_1"},{"key":"e_1_3_3_3_7_1","volume-title":"Notational systems\u2013the cognitive dimensions of notations framework. HCI models, theories, and frameworks: toward an interdisciplinary science. Morgan Kaufmann 234","author":"Blackwell Alan","year":"2003","unstructured":"Alan Blackwell and Thomas Green. 2003. Notational systems\u2013the cognitive dimensions of notations framework. 
HCI models, theories, and frameworks: toward an interdisciplinary science. Morgan Kaufmann 234 (2003)."},{"key":"e_1_3_3_3_8_1","volume-title":"On the opportunities and risks of foundation models. arXiv preprint arXiv:2108.07258","author":"Bommasani Rishi","year":"2021","unstructured":"Rishi Bommasani, Drew\u00a0A Hudson, Ehsan Adeli, Russ Altman, Simran Arora, Sydney von Arx, Michael\u00a0S Bernstein, Jeannette Bohg, Antoine Bosselut, Emma Brunskill, 2021. On the opportunities and risks of foundation models. arXiv preprint arXiv:2108.07258 (2021)."},{"key":"e_1_3_3_3_9_1","volume-title":"Language models are few-shot learners. Advances in neural information processing systems 33","author":"Brown Tom","year":"2020","unstructured":"Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared\u00a0D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, 2020. Language models are few-shot learners. Advances in neural information processing systems 33 (2020), 1877\u20131901."},{"doi-asserted-by":"publisher","key":"e_1_3_3_3_10_1","DOI":"10.1145\/3290605.3300234"},{"key":"e_1_3_3_3_11_1","volume-title":"A unifying reference framework for multi-target user interfaces. Interacting with computers 15, 3","author":"Calvary Ga\u00eblle","year":"2003","unstructured":"Ga\u00eblle Calvary, Jo\u00eblle Coutaz, David Thevenin, Quentin Limbourg, Laurent Bouillon, and Jean Vanderdonckt. 2003. A unifying reference framework for multi-target user interfaces. Interacting with computers 15, 3 (2003), 289\u2013308."},{"doi-asserted-by":"publisher","key":"e_1_3_3_3_12_1","DOI":"10.1145\/3173574.3174025"},{"doi-asserted-by":"publisher","key":"e_1_3_3_3_13_1","DOI":"10.1145\/3411764.3445131"},{"doi-asserted-by":"publisher","key":"e_1_3_3_3_14_1","DOI":"10.1145\/2380116.2380130"},{"doi-asserted-by":"publisher","key":"e_1_3_3_3_15_1","DOI":"10.1109\/ICCV51070.2023.00245"},{"doi-asserted-by":"publisher","key":"e_1_3_3_3_16_1","DOI":"10.1145\/3526113.3545672"},{"doi-asserted-by":"publisher","key":"e_1_3_3_3_17_1","DOI":"10.1613\/jair.1523"},{"key":"e_1_3_3_3_18_1","volume-title":"Retrieved","author":"Face Hugging","year":"2022","unstructured":"Hugging Face. 2022. Hugging Face Transformers: OWL-ViT. Retrieved December 22, 2022 from https:\/\/huggingface.co\/docs\/transformers\/model_doc\/owlvit"},{"key":"e_1_3_3_3_19_1","first-page":"26183","article-title":"You only look at one sequence: Rethinking transformer in vision through object detection","volume":"34","author":"Fang Yuxin","year":"2021","unstructured":"Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, and Wenyu Liu. 2021. You only look at one sequence: Rethinking transformer in vision through object detection. Advances in Neural Information Processing Systems 34 (2021), 26183\u201326197.","journal-title":"Advances in Neural Information Processing Systems"},{"doi-asserted-by":"publisher","key":"e_1_3_3_3_20_1","DOI":"10.1145\/3313831.3376437"},{"doi-asserted-by":"publisher","key":"e_1_3_3_3_21_1","DOI":"10.1109\/ICCV.2017.563"},{"doi-asserted-by":"publisher","key":"e_1_3_3_3_22_1","DOI":"10.1145\/3532106.3533533"},{"doi-asserted-by":"publisher","key":"e_1_3_3_3_23_1","DOI":"10.1145\/3517428.3544819"},{"key":"e_1_3_3_3_24_1","volume-title":"News summarization and evaluation in the era of gpt-3. arXiv preprint arXiv:2209.12356","author":"Goyal Tanya","year":"2022","unstructured":"Tanya Goyal, Junyi\u00a0Jessy Li, and Greg Durrett. 2022. News summarization and evaluation in the era of gpt-3. 
arXiv preprint arXiv:2209.12356 (2022)."},{"doi-asserted-by":"publisher","key":"e_1_3_3_3_25_1","DOI":"10.1145\/3290605.3300854"},{"key":"e_1_3_3_3_26_1","volume-title":"Align and Attend: Multimodal Summarization with Dual Contrastive Losses. arXiv preprint arXiv:2303.07284","author":"He Bo","year":"2023","unstructured":"Bo He, Jun Wang, Jielin Qiu, Trung Bui, Abhinav Shrivastava, and Zhaowen Wang. 2023. Align and Attend: Multimodal Summarization with Dual Contrastive Losses. arXiv preprint arXiv:2303.07284 (2023)."},{"doi-asserted-by":"publisher","key":"e_1_3_3_3_27_1","DOI":"10.1145\/3290605.3300523"},{"doi-asserted-by":"publisher","key":"e_1_3_3_3_28_1","DOI":"10.1145\/302979.303030"},{"doi-asserted-by":"publisher","key":"e_1_3_3_3_29_1","DOI":"10.1109\/CVPR.2016.494"},{"doi-asserted-by":"publisher","key":"e_1_3_3_3_30_1","DOI":"10.1109\/ICCV48922.2021.00180"},{"doi-asserted-by":"publisher","key":"e_1_3_3_3_31_1","DOI":"10.1145\/2642918.2647389"},{"doi-asserted-by":"publisher","key":"e_1_3_3_3_32_1","DOI":"10.1145\/2556288.2556986"},{"doi-asserted-by":"publisher","key":"e_1_3_3_3_33_1","DOI":"10.1007\/s11263-016-0981-7"},{"key":"e_1_3_3_3_34_1","volume-title":"Building Real-World Meeting Summarization Systems using Large Language Models: A Practical Perspective. arXiv preprint arXiv:2310","author":"Laskar Tahmid\u00a0Rahman","year":"2023","unstructured":"Md\u00a0Tahmid\u00a0Rahman Laskar, Xue-Yong Fu, Cheng Chen, and Shashi\u00a0Bhushan TN. 2023. Building Real-World Meeting Summarization Systems using Large Language Models: A Practical Perspective. arXiv preprint arXiv:2310.19233 (2023)."},{"key":"e_1_3_3_3_35_1","volume-title":"Bart: Denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension. arXiv preprint arXiv:1910.13461","author":"Lewis Mike","year":"2019","unstructured":"Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov, and Luke Zettlemoyer. 2019. Bart: Denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension. arXiv preprint arXiv:1910.13461 (2019)."},{"doi-asserted-by":"publisher","key":"e_1_3_3_3_36_1","DOI":"10.1109\/CVPR52688.2022.01069"},{"key":"e_1_3_3_3_37_1","volume-title":"Rouge: A package for automatic evaluation of summaries. In Text summarization branches out. 74\u201381.","author":"Lin Chin-Yew","year":"2004","unstructured":"Chin-Yew Lin. 2004. Rouge: A package for automatic evaluation of summaries. In Text summarization branches out. 74\u201381."},{"doi-asserted-by":"publisher","key":"e_1_3_3_3_38_1","DOI":"10.1007\/978-3-319-10602-1_48"},{"doi-asserted-by":"publisher","key":"e_1_3_3_3_39_1","DOI":"10.1145\/3173574.3173961"},{"key":"e_1_3_3_3_40_1","volume-title":"GPTEval: NLG Evaluation using GPT-4 with Better Human Alignment. arXiv preprint arXiv:2303.16634","author":"Liu Yang","year":"2023","unstructured":"Yang Liu, Dan Iter, Yichong Xu, Shuohang Wang, Ruochen Xu, and Chenguang Zhu. 2023. GPTEval: NLG Evaluation using GPT-4 with Better Human Alignment. arXiv preprint arXiv:2303.16634 (2023)."},{"doi-asserted-by":"publisher","key":"e_1_3_3_3_41_1","DOI":"10.1145\/3313831.3376739"},{"key":"e_1_3_3_3_42_1","volume-title":"Fantastically ordered prompts and where to find them: Overcoming few-shot prompt order sensitivity. arXiv preprint arXiv:2104.08786","author":"Lu Yao","year":"2021","unstructured":"Yao Lu, Max Bartolo, Alastair Moore, Sebastian Riedel, and Pontus Stenetorp. 2021. 
Fantastically ordered prompts and where to find them: Overcoming few-shot prompt order sensitivity. arXiv preprint arXiv:2104.08786 (2021)."},{"key":"e_1_3_3_3_43_1","volume-title":"Video-ChatGPT: Towards Detailed Video Understanding via Large Vision and Language Models. arXiv preprint arXiv:2306.05424","author":"Maaz Muhammad","year":"2023","unstructured":"Muhammad Maaz, Hanoona Rasheed, Salman Khan, and Fahad\u00a0Shahbaz Khan. 2023. Video-ChatGPT: Towards Detailed Video Understanding via Large Vision and Language Models. arXiv preprint arXiv:2306.05424 (2023)."},{"unstructured":"Dotdash Meredith. 2023. Allrecipes. https:\/\/www.allrecipes.com\/","key":"e_1_3_3_3_44_1"},{"volume-title":"Retrieved","year":"2021","unstructured":"Midjourney. 2021. Midjourney. Retrieved December 19, 2022 from https:\/\/www.midjourney.com\/","key":"e_1_3_3_3_45_1"},{"key":"e_1_3_3_3_46_1","volume-title":"Proceedings of the 2004 conference on empirical methods in natural language processing. 404\u2013411","author":"Mihalcea Rada","year":"2004","unstructured":"Rada Mihalcea and Paul Tarau. 2004. Textrank: Bringing order into text. In Proceedings of the 2004 conference on empirical methods in natural language processing. 404\u2013411."},{"key":"e_1_3_3_3_47_1","volume-title":"Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab","author":"Minderer Matthias","year":"2022","unstructured":"Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, 2022. Simple Open-Vocabulary Object Detection with Vision Transformers. arXiv preprint arXiv:2205.06230 (2022)."},{"key":"e_1_3_3_3_48_1","first-page":"13988","article-title":"Clip-it! language-guided video summarization","volume":"34","author":"Narasimhan Medhini","year":"2021","unstructured":"Medhini Narasimhan, Anna Rohrbach, and Trevor Darrell. 2021. Clip-it! language-guided video summarization. Advances in Neural Information Processing Systems 34 (2021), 13988\u201314000.","journal-title":"Advances in Neural Information Processing Systems"},{"unstructured":"Megha Nawhal Jacqueline\u00a0B Lang Greg Mori and Parmit\u00a0K Chilana. 2019. VideoWhiz: Non-Linear Interactive Overviews for Recipe Videos.. In Graphics Interface. 15\u20131.","key":"e_1_3_3_3_49_1"},{"unstructured":"OpenAI. 2022. Introducing ChatGPT. https:\/\/openai.com\/blog\/chatgpt","key":"e_1_3_3_3_50_1"},{"unstructured":"OpenAI. 2023. GPT-4V(ision) System Card. https:\/\/openai.com\/research\/gpt-4v-system-card","key":"e_1_3_3_3_51_1"},{"key":"e_1_3_3_3_52_1","volume-title":"Training language models to follow instructions with human feedback. arXiv preprint arXiv:2203.02155","author":"Ouyang Long","year":"2022","unstructured":"Long Ouyang, Jeff Wu, Xu Jiang, Diogo Almeida, Carroll\u00a0L Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, 2022. Training language models to follow instructions with human feedback. arXiv preprint arXiv:2203.02155 (2022)."},{"doi-asserted-by":"crossref","unstructured":"Amy Pavel Colorado Reed Bj\u00f6rn Hartmann and Maneesh Agrawala. 2014. Video digests: a browsable skimmable format for informational lecture videos.. In UIST Vol.\u00a010. Citeseer 2642918\u20132647400.","key":"e_1_3_3_3_53_1","DOI":"10.1145\/2642918.2647400"},{"key":"e_1_3_3_3_54_1","volume-title":"Retrieved","author":"Perez Sarah","year":"2020","unstructured":"Sarah Perez. 2020. 
YouTube introduces Video Chapters to make it easier to navigate longer videos. Retrieved October 18, 2022 from https:\/\/techcrunch.com\/2020\/05\/28\/youtube-introduces-video-chapters-to-make-it-easier-to-navigate-through-longer-videos\/?guccounter=1"},{"key":"e_1_3_3_3_55_1","volume-title":"Retrieved","author":"Prompting Learn","year":"2023","unstructured":"Learn Prompting. 2023. Your Guide to Communicating with Artificial Intelligence. Retrieved November 14, 2023 from https:\/\/learnprompting.org\/"},{"doi-asserted-by":"publisher","key":"e_1_3_3_3_56_1","DOI":"10.5555\/3455716.3455856"},{"key":"e_1_3_3_3_57_1","volume-title":"International Conference on Machine Learning. PMLR, 8821\u20138831","author":"Ramesh Aditya","year":"2021","unstructured":"Aditya Ramesh, Mikhail Pavlov, Gabriel Goh, Scott Gray, Chelsea Voss, Alec Radford, Mark Chen, and Ilya Sutskever. 2021. Zero-shot text-to-image generation. In International Conference on Machine Learning. PMLR, 8821\u20138831."},{"key":"e_1_3_3_3_58_1","volume-title":"Faster r-cnn: Towards real-time object detection with region proposal networks. Advances in neural information processing systems 28","author":"Ren Shaoqing","year":"2015","unstructured":"Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. 2015. Faster r-cnn: Towards real-time object detection with region proposal networks. Advances in neural information processing systems 28 (2015)."},{"doi-asserted-by":"publisher","key":"e_1_3_3_3_59_1","DOI":"10.1109\/WACV48630.2021.00112"},{"key":"e_1_3_3_3_60_1","volume-title":"LUSE: Using LLMs for Unsupervised Step Extraction in Instructional Videos. https:\/\/cveu.github.io\/2023\/papers\/36.pdf","author":"Shang Chuyi","year":"2023","unstructured":"Chuyi Shang, Emi Tran, Medhini Narasimhan, Sanjay Subramanian, Dan Klein, and Trevor Darrell. 2023. LUSE: Using LLMs for Unsupervised Step Extraction in Instructional Videos. https:\/\/cveu.github.io\/2023\/papers\/36.pdf (2023)."},{"doi-asserted-by":"publisher","key":"e_1_3_3_3_61_1","DOI":"10.1145\/3172944.3172965"},{"key":"e_1_3_3_3_62_1","volume-title":"Proceedings of the IEEE conference on computer vision and pattern recognition. 5179\u20135187","author":"Song Yale","year":"2015","unstructured":"Yale Song, Jordi Vallmitjana, Amanda Stent, and Alejandro Jaimes. 2015. Tvsum: Summarizing web videos using titles. In Proceedings of the IEEE conference on computer vision and pattern recognition. 5179\u20135187."},{"key":"e_1_3_3_3_63_1","volume-title":"Transnet V2: an effective deep network architecture for fast shot transition detection. arXiv preprint arXiv:2008.04838","author":"Sou\u010dek Tom\u00e1\u0161","year":"2020","unstructured":"Tom\u00e1\u0161 Sou\u010dek and Jakub Loko\u010d. 2020. Transnet V2: an effective deep network architecture for fast shot transition detection. arXiv preprint arXiv:2008.04838 (2020)."},{"key":"e_1_3_3_3_64_1","volume-title":"Lamda: Language models for dialog applications. arXiv preprint arXiv:2201.08239","author":"Thoppilan Romal","year":"2022","unstructured":"Romal Thoppilan, Daniel De\u00a0Freitas, Jamie Hall, Noam Shazeer, Apoorv Kulshreshtha, Heng-Tze Cheng, Alicia Jin, Taylor Bos, Leslie Baker, Yu Du, 2022. Lamda: Language models for dialog applications. 
arXiv preprint arXiv:2201.08239 (2022)."},{"doi-asserted-by":"publisher","key":"e_1_3_3_3_65_1","DOI":"10.1145\/3411764.3445721"},{"doi-asserted-by":"publisher","key":"e_1_3_3_3_66_1","DOI":"10.1145\/3313831.3376759"},{"doi-asserted-by":"publisher","key":"e_1_3_3_3_67_1","DOI":"10.1145\/3411764.3445162"},{"doi-asserted-by":"publisher","key":"e_1_3_3_3_68_1","DOI":"10.1145\/2556288.2557407"},{"doi-asserted-by":"publisher","key":"e_1_3_3_3_69_1","DOI":"10.1109\/ICCV48922.2021.00677"},{"doi-asserted-by":"publisher","key":"e_1_3_3_3_70_1","DOI":"10.1145\/2675133.2675219"},{"unstructured":"wikihow. 2023. Welcome to wikiHow the most trusted how-to site on the internet. https:\/\/www.wikihow.com","key":"e_1_3_3_3_71_1"},{"key":"e_1_3_3_3_72_1","volume-title":"The Free Encyclopedia. https:\/\/en.wikipedia.org\/w\/index.php?title=Wizard_(software)&oldid=1182151261 [Online","author":"Wikipedia Wikipedia","year":"2023","unstructured":"Wikipedia contributors. 2023. Wizard (software) \u2014 Wikipedia, The Free Encyclopedia. https:\/\/en.wikipedia.org\/w\/index.php?title=Wizard_(software)&oldid=1182151261 [Online; accessed 21-November-2023]."},{"doi-asserted-by":"publisher","key":"e_1_3_3_3_73_1","DOI":"10.1145\/3491102.3517582"},{"key":"e_1_3_3_3_74_1","volume-title":"Improving Video Interfaces by Presenting Informational Units of Videos. CHI\u201922 Extended Abstracts","author":"Yang Saelyne","year":"2022","unstructured":"Saelyne Yang, Sangkyung Kwak, Tae\u00a0Soo Kim, and Juho Kim. 2022. Improving Video Interfaces by Presenting Informational Units of Videos. CHI\u201922 Extended Abstracts. Association for Computing Machinery (2022)."},{"key":"e_1_3_3_3_75_1","volume-title":"Exploring the limits of chatgpt for query or aspect-based text summarization. arXiv preprint arXiv:2302.08081","author":"Yang Xianjun","year":"2023","unstructured":"Xianjun Yang, Yan Li, Xinlu Zhang, Haifeng Chen, and Wei Cheng. 2023. Exploring the limits of chatgpt for query or aspect-based text summarization. arXiv preprint arXiv:2302.08081 (2023)."},{"unstructured":"YouTube. 2023. YouTube. https:\/\/www.youtube.com\/","key":"e_1_3_3_3_76_1"},{"doi-asserted-by":"publisher","key":"e_1_3_3_3_77_1","DOI":"10.1109\/CVPR.2019.00134"},{"key":"e_1_3_3_3_78_1","volume-title":"Video-llama: An instruction-tuned audio-visual language model for video understanding. arXiv preprint arXiv:2306.02858","author":"Zhang Hang","year":"2023","unstructured":"Hang Zhang, Xin Li, and Lidong Bing. 2023. Video-llama: An instruction-tuned audio-visual language model for video understanding. arXiv preprint arXiv:2306.02858 (2023)."},{"key":"e_1_3_3_3_79_1","volume-title":"Benchmarking large language models for news summarization. arXiv preprint arXiv:2301.13848","author":"Zhang Tianyi","year":"2023","unstructured":"Tianyi Zhang, Faisal Ladhak, Esin Durmus, Percy Liang, Kathleen McKeown, and Tatsunori\u00a0B Hashimoto. 2023. Benchmarking large language models for news summarization. arXiv preprint arXiv:2301.13848 (2023)."},{"key":"e_1_3_3_3_80_1","volume-title":"Siren\u2019s Song in the AI Ocean: A Survey on Hallucination in Large Language Models. arXiv preprint arXiv:2309.01219","author":"Zhang Yue","year":"2023","unstructured":"Yue Zhang, Yafu Li, Leyang Cui, Deng Cai, Lemao Liu, Tingchen Fu, Xinting Huang, Enbo Zhao, Yu Zhang, Yulong Chen, 2023. Siren\u2019s Song in the AI Ocean: A Survey on Hallucination in Large Language Models. 
arXiv preprint arXiv:2309.01219 (2023)."},{"key":"e_1_3_3_3_81_1","volume-title":"Understanding Voice Control of Instructional Videos in Everyday Tasks. In CHI Conference on Human Factors in Computing Systems. 1\u201311","author":"Zhao Yaxi","year":"2022","unstructured":"Yaxi Zhao, Razan Jaber, Donald McMillan, and Cosmin Munteanu. 2022. \u201cRewind to the Jiggling Meat Part\u201d: Understanding Voice Control of Instructional Videos in Everyday Tasks. In CHI Conference on Human Factors in Computing Systems. 1\u201311."},{"doi-asserted-by":"publisher","key":"e_1_3_3_3_82_1","DOI":"10.1609\/aaai.v32i1.12342"},{"doi-asserted-by":"publisher","key":"e_1_3_3_3_83_1","DOI":"10.1109\/CVPR.2018.00911"},{"doi-asserted-by":"publisher","key":"e_1_3_3_3_84_1","DOI":"10.1109\/TIP.2020.3039886"},{"key":"e_1_3_3_3_85_1","volume-title":"End-to-end dense video captioning as sequence generation. arXiv preprint arXiv:2204.08121","author":"Zhu Wanrong","year":"2022","unstructured":"Wanrong Zhu, Bo Pang, Ashish\u00a0V Thapliyal, William\u00a0Yang Wang, and Radu Soricut. 2022. End-to-end dense video captioning as sequence generation. arXiv preprint arXiv:2204.08121 (2022)."}],"event":{"sponsor":["SIGCHI ACM Special Interest Group on Computer-Human Interaction","SIGACCESS ACM Special Interest Group on Accessible Computing"],"acronym":"CHI '24","name":"CHI '24: CHI Conference on Human Factors in Computing Systems","location":"Honolulu HI USA"},"container-title":["Proceedings of the CHI Conference on Human Factors in Computing Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3613904.3642443","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,19]],"date-time":"2024-09-19T12:55:05Z","timestamp":1726750505000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3613904.3642443"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,11]]},"references-count":85,"alternative-id":["10.1145\/3613904.3642443","10.1145\/3613904"],"URL":"https:\/\/doi.org\/10.1145\/3613904.3642443","relation":{},"subject":[],"published":{"date-parts":[[2024,5,11]]},"assertion":[{"value":"2024-05-11","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
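The record above is a standard Crossref REST API "work" message for the TutoAI CHI '24 paper (DOI 10.1145/3613904.3642443). As a minimal sketch of how such a record can be fetched and navigated, the Python snippet below retrieves the same message from the public api.crossref.org works route and pulls out a few of the fields present in the record (title, authors, reference counts). It assumes the third-party requests library and network access; error handling is deliberately minimal.

```python
# Minimal sketch: fetch this Crossref work record and extract a few fields.
# Assumes the third-party `requests` library and network access to api.crossref.org.
import requests

DOI = "10.1145/3613904.3642443"  # TutoAI, CHI '24

resp = requests.get(f"https://api.crossref.org/works/{DOI}", timeout=30)
resp.raise_for_status()
msg = resp.json()["message"]  # top-level envelope: status / message-type / message

title = msg["title"][0]                    # "title" is a list in Crossref records
authors = [f'{a.get("given", "")} {a.get("family", "")}'.strip()
           for a in msg.get("author", [])]
n_refs = msg.get("references-count")       # 85 for this record
cited_by = msg.get("is-referenced-by-count")

print(title)
print(", ".join(authors))
print(f"references: {n_refs}, cited by: {cited_by}")

# The "reference" array mixes DOI-backed entries with unstructured strings:
refs = msg.get("reference", [])
with_doi = [r for r in refs if "DOI" in r]
print(f"{len(with_doi)} of {len(refs)} references carry a DOI")
```

The same status-plus-message envelope applies to every record returned by the Crossref works route, so the field accesses shown here should generalize beyond this DOI, with the caveat that optional keys (e.g. "author", "reference") can be absent and are best read with .get().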