{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,11,7]],"date-time":"2024-11-07T19:40:28Z","timestamp":1731008428748,"version":"3.28.0"},"publisher-location":"Cham","reference-count":57,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031732225","type":"print"},{"value":"9783031732232","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,8]],"date-time":"2024-11-08T00:00:00Z","timestamp":1731024000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,8]],"date-time":"2024-11-08T00:00:00Z","timestamp":1731024000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,8]],"date-time":"2024-11-08T00:00:00Z","timestamp":1731024000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,8]],"date-time":"2024-11-08T00:00:00Z","timestamp":1731024000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73223-2_22","type":"book-chapter","created":{"date-parts":[[2024,11,7]],"date-time":"2024-11-07T18:49:44Z","timestamp":1731005384000},"page":"399-415","update-policy":"http:\/\/dx.doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Common Sense Reasoning for\u00a0Deepfake Detection"],"prefix":"10.1007","author":[{"given":"Yue","family":"Zhang","sequence":"first","affiliation":[]},{"given":"Ben","family":"Colman","sequence":"additional","affiliation":[]},{"given":"Xiao","family":"Guo","sequence":"additional","affiliation":[]},{"given":"Ali","family":"Shahriyari","sequence":"additional","affiliation":[]},{"given":"Gaurav","family":"Bharaj","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,8]]},"reference":[{"key":"22_CR1","doi-asserted-by":"crossref","unstructured":"Agrawal, H., et al.: NoCaps: novel object captioning at scale. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 8948\u20138957 (2019)","DOI":"10.1109\/ICCV.2019.00904"},{"key":"22_CR2","first-page":"23716","volume":"35","author":"JB Alayrac","year":"2022","unstructured":"Alayrac, J.B., et al.: Flamingo: a visual language model for few-shot learning. Adv. Neural. Inf. Process. Syst. 35, 23716\u201323736 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"22_CR3","doi-asserted-by":"publisher","unstructured":"Anderson, P., Fernando, B., Johnson, M., Gould, S.: SPICE: semantic propositional image caption evaluation. In: Computer Vision\u2013ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11-14, 2016, Proceedings, Part V 14, pp. 382\u2013398. Springer (2016). https:\/\/doi.org\/10.1007\/978-3-319-46454-1_24","DOI":"10.1007\/978-3-319-46454-1_24"},{"key":"22_CR4","doi-asserted-by":"crossref","unstructured":"Anderson, P., et al.: Vision-and-language navigation: interpreting visually-grounded navigation instructions in real environments. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 
3674\u20133683 (2018)","DOI":"10.1109\/CVPR.2018.00387"},{"key":"22_CR5","doi-asserted-by":"crossref","unstructured":"Antol, S., et al.: VQA: visual question answering. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2425\u20132433 (2015)","DOI":"10.1109\/ICCV.2015.279"},{"key":"22_CR6","doi-asserted-by":"crossref","unstructured":"Bai, W., Liu, Y., Zhang, Z., Li, B., Hu, W.: AUNet: learning relations between action units for face forgery detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 24709\u201324719 (2023)","DOI":"10.1109\/CVPR52729.2023.02367"},{"key":"22_CR7","doi-asserted-by":"crossref","unstructured":"Cao, J., Ma, C., Yao, T., Chen, S., Ding, S., Yang, X.: End-to-end reconstruction-classification learning for face forgery detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4113\u20134122 (2022)","DOI":"10.1109\/CVPR52688.2022.00408"},{"key":"22_CR8","doi-asserted-by":"crossref","unstructured":"Chollet, F.: Xception: deep learning with depthwise separable convolutions. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1251\u20131258 (2017)","DOI":"10.1109\/CVPR.2017.195"},{"key":"22_CR9","doi-asserted-by":"publisher","unstructured":"Coccomini, D.A., Messina, N., Gennaro, C., Falchi, F.: Combining efficientnet and vision transformers for video deepfake detection. In: International Conference on Image Analysis and Processing, pp. 219\u2013229. Springer (2022). https:\/\/doi.org\/10.1007\/978-3-031-06433-3_19","DOI":"10.1007\/978-3-031-06433-3_19"},{"key":"22_CR10","doi-asserted-by":"crossref","unstructured":"Denkowski, M., Lavie, A.: Meteor universal: language specific translation evaluation for any target language. In: Proceedings of the Ninth Workshop on Statistical Machine Translation, pp. 376\u2013380 (2014)","DOI":"10.3115\/v1\/W14-3348"},{"key":"22_CR11","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)"},{"key":"22_CR12","unstructured":"Dosovitskiy, A., et\u00a0al.: An image is worth 16x16 words: transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)"},{"key":"22_CR13","unstructured":"Draelos, R.L., Carin, L.: Use hirescam instead of grad-cam for faithful explanations of convolutional neural networks. arXiv preprint arXiv:2011.08891 (2020)"},{"issue":"4","key":"22_CR14","doi-asserted-by":"publisher","first-page":"11","DOI":"10.1109\/MCG.2008.79","volume":"28","author":"T Geller","year":"2008","unstructured":"Geller, T.: Overcoming the uncanny valley. IEEE Comput. Graphics Appl. 28(4), 11\u201317 (2008)","journal-title":"IEEE Comput. Graphics Appl."},{"issue":"11","key":"22_CR15","doi-asserted-by":"publisher","first-page":"139","DOI":"10.1145\/3422622","volume":"63","author":"I Goodfellow","year":"2020","unstructured":"Goodfellow, I., et al.: Generative adversarial networks. Commun. ACM 63(11), 139\u2013144 (2020)","journal-title":"Commun. ACM"},{"key":"22_CR16","doi-asserted-by":"crossref","unstructured":"Guo, X., Liu, X., Ren, Z., Grosz, S., Masi, I., Liu, X.: Hierarchical fine-grained image forgery detection and localization. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 
3155\u20133165 (2023)","DOI":"10.1109\/CVPR52729.2023.00308"},{"key":"22_CR17","doi-asserted-by":"crossref","unstructured":"Guo, Y., Zhen, C., Yan, P.: Controllable guide-space for generalizable face forgery detection. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 20818\u201320827 (2023)","DOI":"10.1109\/ICCV51070.2023.01903"},{"key":"22_CR18","doi-asserted-by":"crossref","unstructured":"Gupta, T., Kembhavi, A.: Visual programming: Compositional visual reasoning without training. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14953\u201314962 (2023)","DOI":"10.1109\/CVPR52729.2023.01436"},{"key":"22_CR19","doi-asserted-by":"crossref","unstructured":"Haliassos, A., Mira, R., Petridis, S., Pantic, M.: Leveraging real talking faces via self-supervision for robust forgery detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14950\u201314962 (2022)","DOI":"10.1109\/CVPR52688.2022.01453"},{"key":"22_CR20","first-page":"6840","volume":"33","author":"J Ho","year":"2020","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. Adv. Neural. Inf. Process. Syst. 33, 6840\u20136851 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"22_CR21","doi-asserted-by":"crossref","unstructured":"Karras, T., Laine, S., Aittala, M., Hellsten, J., Lehtinen, J., Aila, T.: Analyzing and improving the image quality of stylegan. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8110\u20138119 (2020)","DOI":"10.1109\/CVPR42600.2020.00813"},{"key":"22_CR22","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2023.119843","volume":"222","author":"F Khalid","year":"2023","unstructured":"Khalid, F., Javed, A., Ilyas, H., Irtaza, A., et al.: DFGNN: an interpretable and generalized graph neural network for deepfakes detection. Expert Syst. Appl. 222, 119843 (2023)","journal-title":"Expert Syst. Appl."},{"key":"22_CR23","unstructured":"Li, J., Li, D., Xiong, C., Hoi, S.: BLIP: bootstrapping language-image pre-training for unified vision-language understanding and generation. In: International Conference on Machine Learning, pp. 12888\u201312900. PMLR (2022)"},{"key":"22_CR24","doi-asserted-by":"crossref","unstructured":"Li, L., Bao, J., Zhang, T., Yang, H., Chen, D., Wen, F., Guo, B.: Face x-ray for more general face forgery detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5001\u20135010 (2020)","DOI":"10.1109\/CVPR42600.2020.00505"},{"key":"22_CR25","doi-asserted-by":"crossref","unstructured":"Li, Y., Yang, X., Sun, P., Qi, H., Lyu, S.: Celeb-DF: a large-scale challenging dataset for deepfake forensics. In: Proceedings of the IEEE\/Cvf Conference on Computer Vision and Pattern Recognition, pp. 3207\u20133216 (2020)","DOI":"10.1109\/CVPR42600.2020.00327"},{"key":"22_CR26","unstructured":"Lin, C.Y.: Rouge: A package for automatic evaluation of summaries. In: Text Summarization Branches Out, pp. 74\u201381 (2004)"},{"key":"22_CR27","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)"},{"issue":"1","key":"22_CR28","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3425780","volume":"54","author":"Y Mirsky","year":"2021","unstructured":"Mirsky, Y., Lee, W.: The creation and detection of deepfakes: a survey. ACM Comput. Surv. 
(CSUR) 54(1), 1\u201341 (2021)","journal-title":"ACM Comput. Surv. (CSUR)"},{"key":"22_CR29","unstructured":"Oord, A.v.d., Li, Y., Vinyals, O.: Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 (2018)"},{"key":"22_CR30","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.J.: BLEU: a method for automatic evaluation of machine translation. In: Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics, pp. 311\u2013318 (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"22_CR31","unstructured":"Paszke, A., et\u00a0al.: PyTorch: an imperative style, high-performance deep learning library. In: Advances in neural information processing systems, vol. 32 (2019)"},{"key":"22_CR32","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., Chen, M.: Hierarchical text-conditional image generation with CLIP latents. arXiv preprint arXiv:2204.06125 (2022)"},{"key":"22_CR33","unstructured":"Ricker, J., Damm, S., Holz, T., Fischer, A.: Towards the detection of diffusion model deepfakes. arXiv preprint arXiv:2210.14571 (2022)"},{"key":"22_CR34","doi-asserted-by":"crossref","unstructured":"Rossler, A., Cozzolino, D., Verdoliva, L., Riess, C., Thies, J., Nie\u00dfner, M.: Faceforensics++: learning to detect manipulated facial images. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1\u201311 (2019)","DOI":"10.1109\/ICCV.2019.00009"},{"key":"22_CR35","first-page":"36479","volume":"35","author":"C Saharia","year":"2022","unstructured":"Saharia, C., et al.: Photorealistic text-to-image diffusion models with deep language understanding. Adv. Neural. Inf. Process. Syst. 35, 36479\u201336494 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"22_CR36","doi-asserted-by":"crossref","unstructured":"Selvaraju, R.R., Cogswell, M., Das, A., Vedantam, R., Parikh, D., Batra, D.: Grad-cam: Visual explanations from deep networks via gradient-based localization. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 618\u2013626 (2017)","DOI":"10.1109\/ICCV.2017.74"},{"key":"22_CR37","doi-asserted-by":"publisher","unstructured":"Shao, R., Wu, T., Liu, Z.: Detecting and recovering sequential deepfake manipulation. In: European Conference on Computer Vision. pp. 712\u2013728. Springer (2022). https:\/\/doi.org\/10.1007\/978-3-031-19778-9_41","DOI":"10.1007\/978-3-031-19778-9_41"},{"key":"22_CR38","doi-asserted-by":"crossref","unstructured":"Shiohara, K., Yamasaki, T.: Detecting deepfakes with self-blended images. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18720\u201318729 (2022)","DOI":"10.1109\/CVPR52688.2022.01816"},{"key":"22_CR39","unstructured":"Simonyan, K., Vedaldi, A., Zisserman, A.: Deep inside convolutional networks: visualising image classification models and saliency maps. arXiv preprint arXiv:1312.6034 (2014)"},{"key":"22_CR40","unstructured":"Sun, K., Chen, S., Yao, T., Sun, X., Ding, S., Ji, R.: Towards general visual-linguistic face forgery detection. arXiv preprint arXiv:2307.16545 (2023)"},{"key":"22_CR41","unstructured":"Sundararajan, M., Taly, A., Yan, Q.: Axiomatic attribution for deep networks. In: International Conference on Machine Learning, pp. 3319\u20133328. PMLR (2017)"},{"key":"22_CR42","unstructured":"Tan, M., Le, Q.: EfficientNet: rethinking model scaling for convolutional neural networks. In: International Conference on Machine Learning, pp. 6105\u20136114. 
PMLR (2019)"},{"key":"22_CR43","doi-asserted-by":"publisher","first-page":"131","DOI":"10.1016\/j.inffus.2020.06.014","volume":"64","author":"R Tolosana","year":"2020","unstructured":"Tolosana, R., Vera-Rodriguez, R., Fierrez, J., Morales, A., Ortega-Garcia, J.: Deepfakes and beyond: a survey of face manipulation and fake detection. Inf. Fusion 64, 131\u2013148 (2020)","journal-title":"Inf. Fusion"},{"key":"22_CR44","doi-asserted-by":"crossref","unstructured":"Trinh, L., Tsang, M., Rambhatla, S., Liu, Y.: Interpretable and trustworthy deepfake detection via dynamic prototypes. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 1973\u20131983 (2021)","DOI":"10.1109\/WACV48630.2021.00202"},{"key":"22_CR45","unstructured":"Turton, W., Martin, A.: How deepfakes make disinformation more real than ever. Bloomberg News (2020)"},{"key":"22_CR46","doi-asserted-by":"crossref","unstructured":"Vaccari, C., Chadwick, A.: Deepfakes and disinformation: Exploring the impact of synthetic political video on deception, uncertainty, and trust in news. Soc. Media+ Soc. 6(1), 2056305120903408 (2020)","DOI":"10.1177\/2056305120903408"},{"key":"22_CR47","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Lawrence\u00a0Zitnick, C., Parikh, D.: CIDEr: consensus-based image description evaluation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4566\u20134575 (2015)","DOI":"10.1109\/CVPR.2015.7299087"},{"issue":"4","key":"22_CR48","doi-asserted-by":"publisher","first-page":"652","DOI":"10.1109\/TPAMI.2016.2587640","volume":"39","author":"O Vinyals","year":"2016","unstructured":"Vinyals, O., Toshev, A., Bengio, S., Erhan, D.: Show and tell: Lessons learned from the 2015 MSCOCO image captioning challenge. IEEE Trans. Pattern Anal. Mach. Intell. 39(4), 652\u2013663 (2016)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"22_CR49","unstructured":"Wang, Q., Bai, X., Wang, H., Qin, Z., Chen, A.: InstantID: zero-shot identity-preserving generation in seconds. arXiv preprint arXiv:2401.07519 (2024)"},{"key":"22_CR50","doi-asserted-by":"crossref","unstructured":"Yang, W., et al.: Avoid-DF: Audio-visual joint learning for detecting deepfake. IEEE Trans. Inf. Forensics Secur. 18, 2015\u20132029 (2023)","DOI":"10.1109\/TIFS.2023.3262148"},{"key":"22_CR51","unstructured":"Ye, H., Zhang, J., Liu, S., Han, X., Yang, W.: Ip-adapter: text compatible image prompt adapter for text-to-image diffusion models (2023)"},{"key":"22_CR52","doi-asserted-by":"crossref","unstructured":"Zellers, R., Bisk, Y., Farhadi, A., Choi, Y.: From recognition to cognition: visual commonsense reasoning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6720\u20136731 (2019)","DOI":"10.1109\/CVPR.2019.00688"},{"key":"22_CR53","unstructured":"Zhang, Y., Guo, Q., Kordjamshidi, P.: Navhint: Vision and language navigation agent with a hint generator. arXiv preprint arXiv:2402.02559 (2024)"},{"key":"22_CR54","unstructured":"Zhang, Y., Kordjamshidi, P.: Lovis: learning orientation and visual signals for vision and language navigation. arXiv preprint arXiv:2209.12723 (2022)"},{"key":"22_CR55","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Kordjamshidi, P.: VLN-trans: translator for the vision and language navigation agent. 
arXiv preprint arXiv:2302.09230 (2023)","DOI":"10.18653\/v1\/2023.acl-long.737"},{"key":"22_CR56","unstructured":"Zhu, D., Chen, J., Shen, X., Li, X., Elhoseiny, M.: MiniGPT-4: enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592 (2023)"},{"key":"22_CR57","doi-asserted-by":"crossref","unstructured":"Zi, B., Chang, M., Chen, J., Ma, X., Jiang, Y.G.: WildDeepfake: a challenging real-world dataset for deepfake detection. In: Proceedings of the 28th ACM International Conference on Multimedia, pp. 2382\u20132390 (2020)","DOI":"10.1145\/3394171.3413769"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73223-2_22","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,7]],"date-time":"2024-11-07T19:07:32Z","timestamp":1731006452000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73223-2_22"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,8]]},"ISBN":["9783031732225","9783031732232"],"references-count":57,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73223-2_22","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,8]]},"assertion":[{"value":"8 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}
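The object above is a single work record from the Crossref REST API: the {"status":"ok","message-type":"work",...} envelope wraps the chapter metadata (title, authors, ISBNs, license, the 57-entry reference list, and the ECCV conference assertions) under the "message" key. Below is a minimal sketch of how such a record can be fetched and unpacked, assuming the third-party requests package; the mailto contact address is a placeholder for Crossref's "polite pool" etiquette, not a real value.

import requests

# DOI of the chapter whose record is shown above.
DOI = "10.1007/978-3-031-73223-2_22"

# Crossref asks polite clients to identify themselves via a mailto
# parameter; the address below is a placeholder, not a real contact.
resp = requests.get(
    f"https://api.crossref.org/works/{DOI}",
    params={"mailto": "you@example.com"},
    timeout=30,
)
resp.raise_for_status()

# The payload mirrors the record above: the work itself lives under "message".
work = resp.json()["message"]

title = work["title"][0]  # Crossref stores titles as a list
authors = [f'{a["given"]} {a["family"]}' for a in work.get("author", [])]

print(f'{title} ({work["DOI"]})')
print("Authors:", ", ".join(authors))
print("References deposited:", work.get("references-count", 0))

The field names used here (title, author, DOI, references-count) all appear verbatim in the record above; the rest is standard requests usage against the documented api.crossref.org/works/{doi} endpoint.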