{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,4,10]],"date-time":"2025-04-10T05:14:19Z","timestamp":1744262059262,"version":"3.40.3"},"publisher-location":"Cham","reference-count":66,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031198144"},{"type":"electronic","value":"9783031198151"}],"license":[{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022]]},"DOI":"10.1007\/978-3-031-19815-1_29","type":"book-chapter","created":{"date-parts":[[2022,10,19]],"date-time":"2022-10-19T23:11:54Z","timestamp":1666221114000},"page":"498-517","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":118,"title":["OCR-Free Document Understanding Transformer"],"prefix":"10.1007","author":[{"given":"Geewook","family":"Kim","sequence":"first","affiliation":[]},{"given":"Teakgyu","family":"Hong","sequence":"additional","affiliation":[]},{"given":"Moonbin","family":"Yim","sequence":"additional","affiliation":[]},{"given":"JeongYeon","family":"Nam","sequence":"additional","affiliation":[]},{"given":"Jinyoung","family":"Park","sequence":"additional","affiliation":[]},{"given":"Jinyeong","family":"Yim","sequence":"additional","affiliation":[]},{"given":"Wonseok","family":"Hwang","sequence":"additional","affiliation":[]},{"given":"Sangdoo","family":"Yun","sequence":"additional","affiliation":[]},{"given":"Dongyoon","family":"Han","sequence":"additional","affiliation":[]},{"given":"Seunghyun","family":"Park","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,10,20]]},"reference":[{"key":"29_CR1","doi-asserted-by":"publisher","unstructured":"Afzal, M.Z., et al.: Deepdocclassifier: document classification with deep convolutional neural network. In: 2015 13th International Conference on Document Analysis and Recognition (ICDAR), pp. 1111\u20131115 (2015). https:\/\/doi.org\/10.1109\/ICDAR.2015.7333933","DOI":"10.1109\/ICDAR.2015.7333933"},{"key":"29_CR2","doi-asserted-by":"crossref","unstructured":"Appalaraju, S., Jasani, B., Kota, B.U., Xie, Y., Manmatha, R.: Docformer: end-to-end transformer for document understanding. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 993\u20131003, October 2021","DOI":"10.1109\/ICCV48922.2021.00103"},{"key":"29_CR3","doi-asserted-by":"crossref","unstructured":"Baek, J., et al.: What is wrong with scene text recognition model comparisons? Dataset and model analysis. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), October 2019","DOI":"10.1109\/ICCV.2019.00481"},{"key":"29_CR4","doi-asserted-by":"publisher","unstructured":"Baek, Y., Lee, B., Han, D., Yun, S., Lee, H.: Character region awareness for text detection. In: 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 9357\u20139366 (2019). https:\/\/doi.org\/10.1109\/CVPR.2019.00959","DOI":"10.1109\/CVPR.2019.00959"},{"key":"29_CR5","unstructured":"Brown, T., et al.: Language models are few-shot learners. In: Larochelle, H., Ranzato, M., Hadsell, R., Balcan, M.F., Lin, H. (eds.) Advances in Neural Information Processing Systems, vol. 33, pp. 1877\u20131901. Curran Associates, Inc. (2020). https:\/\/proceedings.neurips.cc\/paper\/2020\/file\/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf"},{"key":"29_CR6","doi-asserted-by":"publisher","unstructured":"Davis, B., Morse, B., Cohen, S., Price, B., Tensmeyer, C.: Deep visual template-free form parsing. In: 2019 International Conference on Document Analysis and Recognition (ICDAR), pp. 134\u2013141 (2019). https:\/\/doi.org\/10.1109\/ICDAR.2019.00030","DOI":"10.1109\/ICDAR.2019.00030"},{"key":"29_CR7","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: Imagenet: a large-scale hierarchical image database. In: 2009 IEEE Conference on Computer Vision and Pattern Recognition, pp. 248\u2013255. IEEE (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"29_CR8","doi-asserted-by":"publisher","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. In: Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers), Minneapolis, Minnesota, pp. 4171\u20134186. Association for Computational Linguistics, June 2019. https:\/\/doi.org\/10.18653\/v1\/N19-1423. https:\/\/aclanthology.org\/N19-1423","DOI":"10.18653\/v1\/N19-1423"},{"key":"29_CR9","unstructured":"Dosovitskiy, A., et al.: An image is worth 16x16 words: transformers for image recognition at scale. In: 9th International Conference on Learning Representations, ICLR 2021, Virtual Event, Austria, 3\u20137 May 2021. OpenReview.net (2021). https:\/\/openreview.net\/forum?id=YicbFdNTTy"},{"key":"29_CR10","unstructured":"Duong, Q., H\u00e4m\u00e4l\u00e4inen, M., Hengchen, S.: An unsupervised method for OCR post-correction and spelling normalisation for Finnish. In: Proceedings of the 23rd Nordic Conference on Computational Linguistics (NoDaLiDa), Sweden, Reykjavik, Iceland, 31 May\u20132 June 2021, pp. 240\u2013248. Link\u00f6ping University Electronic Press (2021). https:\/\/aclanthology.org\/2021.nodalida-main.24"},{"key":"29_CR11","unstructured":"Friedl, J.E.F.: Mastering Regular Expressions, 3 edn. O\u2019Reilly, Beijing (2006). https:\/\/www.safaribooksonline.com\/library\/view\/mastering-regular-expressions\/0596528124\/"},{"key":"29_CR12","doi-asserted-by":"publisher","unstructured":"Guo, H., Qin, X., Liu, J., Han, J., Liu, J., Ding, E.: Eaten: entity-aware attention for single shot visual text extraction. In: 2019 International Conference on Document Analysis and Recognition (ICDAR), pp. 254\u2013259 (2019). https:\/\/doi.org\/10.1109\/ICDAR.2019.00049","DOI":"10.1109\/ICDAR.2019.00049"},{"key":"29_CR13","doi-asserted-by":"crossref","unstructured":"Gupta, A., Vedaldi, A., Zisserman, A.: Synthetic data for text localisation in natural images. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), June 2016","DOI":"10.1109\/CVPR.2016.254"},{"key":"29_CR14","doi-asserted-by":"publisher","unstructured":"Hammami, M., H\u00e9roux, P., Adam, S., d\u2019Andecy, V.P.: One-shot field spotting on colored forms using subgraph isomorphism. In: 2015 13th International Conference on Document Analysis and Recognition (ICDAR), pp. 586\u2013590 (2015). https:\/\/doi.org\/10.1109\/ICDAR.2015.7333829","DOI":"10.1109\/ICDAR.2015.7333829"},{"key":"29_CR15","doi-asserted-by":"publisher","unstructured":"Harley, A.W., Ufkes, A., Derpanis, K.G.: Evaluation of deep convolutional nets for document image classification and retrieval. In: 2015 13th International Conference on Document Analysis and Recognition (ICDAR), pp. 991\u2013995 (2015). https:\/\/doi.org\/10.1109\/ICDAR.2015.7333910","DOI":"10.1109\/ICDAR.2015.7333910"},{"key":"29_CR16","doi-asserted-by":"publisher","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 770\u2013778 (2016). https:\/\/doi.org\/10.1109\/CVPR.2016.90","DOI":"10.1109\/CVPR.2016.90"},{"key":"29_CR17","doi-asserted-by":"publisher","unstructured":"Hong, T., Kim, D., Ji, M., Hwang, W., Nam, D., Park, S.: Bros: a pre-trained language model focusing on text and layout for better key information extraction from documents. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 36, no. 10, pp. 10767\u201310775, June 2022. https:\/\/doi.org\/10.1609\/aaai.v36i10.21322. https:\/\/ojs.aaai.org\/index.php\/AAAI\/article\/view\/21322","DOI":"10.1609\/aaai.v36i10.21322"},{"key":"29_CR18","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"497","DOI":"10.1007\/978-3-319-10593-2_33","volume-title":"Computer Vision \u2013 ECCV 2014","author":"W Huang","year":"2014","unstructured":"Huang, W., Qiao, Yu., Tang, X.: Robust scene text detection with convolution neural network induced MSER trees. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8692, pp. 497\u2013511. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10593-2_33"},{"key":"29_CR19","doi-asserted-by":"publisher","unstructured":"Huang, Z., et al.: ICDAR 2019 competition on scanned receipt OCR and information extraction. In: 2019 International Conference on Document Analysis and Recognition (ICDAR), pp. 1516\u20131520 (2019). https:\/\/doi.org\/10.1109\/ICDAR.2019.00244","DOI":"10.1109\/ICDAR.2019.00244"},{"key":"29_CR20","unstructured":"Hwang, A., Frey, W.R., McKeown, K.: Towards augmenting lexical resources for slang and African American English. In: Proceedings of the 7th Workshop on NLP for Similar Languages, Varieties and Dialects, Barcelona, Spain, pp. 160\u2013172. International Committee on Computational Linguistics (ICCL), December 2020. https:\/\/aclanthology.org\/2020.vardial-1.15"},{"key":"29_CR21","unstructured":"Hwang, W., et al.: Post-OCR parsing: building simple and robust parser via bio tagging. In: Workshop on Document Intelligence at NeurIPS 2019 (2019)"},{"key":"29_CR22","doi-asserted-by":"publisher","unstructured":"Hwang, W., Lee, H., Yim, J., Kim, G., Seo, M.: Cost-effective end-to-end information extraction for semi-structured document images. In: Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing, pp. 3375\u20133383. Association for Computational Linguistics, Online and Punta Cana, Dominican Republic, November 2021. https:\/\/doi.org\/10.18653\/v1\/2021.emnlp-main.271. https:\/\/aclanthology.org\/2021.emnlp-main.271","DOI":"10.18653\/v1\/2021.emnlp-main.271"},{"key":"29_CR23","doi-asserted-by":"publisher","unstructured":"Hwang, W., Yim, J., Park, S., Yang, S., Seo, M.: Spatial dependency parsing for semi-structured document information extraction. In: Findings of the Association for Computational Linguistics: ACL-IJCNLP 2021, pp. 330\u2013343. Association for Computational Linguistics, August 2021. https:\/\/doi.org\/10.18653\/v1\/2021.findings-acl.28. https:\/\/aclanthology.org\/2021.findings-acl.28","DOI":"10.18653\/v1\/2021.findings-acl.28"},{"key":"29_CR24","unstructured":"Jaderberg, M., Simonyan, K., Vedaldi, A., Zisserman, A.: Synthetic data and artificial neural networks for natural scene text recognition. In: Workshop on Deep Learning, NIPS (2014)"},{"key":"29_CR25","doi-asserted-by":"crossref","unstructured":"Kang, L., Kumar, J., Ye, P., Li, Y., Doermann, D.S.: Convolutional neural networks for document image classification. In: 2014 22nd International Conference on Pattern Recognition, pp. 3168\u20133172 (2014)","DOI":"10.1109\/ICPR.2014.546"},{"key":"29_CR26","doi-asserted-by":"publisher","unstructured":"Karatzas, D., et al.: ICDAR 2015 competition on robust reading. In: 2015 13th International Conference on Document Analysis and Recognition (ICDAR), pp. 1156\u20131160 (2015). https:\/\/doi.org\/10.1109\/ICDAR.2015.7333942","DOI":"10.1109\/ICDAR.2015.7333942"},{"key":"29_CR27","unstructured":"Kim, W., Son, B., Kim, I.: ViLT: vision-and-language transformer without convolution or region supervision. In: Meila, M., Zhang, T. (eds.) Proceedings of the 38th International Conference on Machine Learning. Proceedings of Machine Learning Research, 18\u201324 July 2021, vol. 139, pp. 5583\u20135594. PMLR (2021). http:\/\/proceedings.mlr.press\/v139\/kim21k.html"},{"key":"29_CR28","unstructured":"Kingma, D.P., Ba, J.: Adam: A method for stochastic optimization. In: Bengio, Y., LeCun, Y. (eds.) 3rd International Conference on Learning Representations, ICLR 2015, San Diego, CA, USA, May 7\u20139, 2015, Conference Track Proceedings (2015), http:\/\/arxiv.org\/abs\/1412.6980"},{"key":"29_CR29","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"451","DOI":"10.1007\/978-3-030-86549-8_29","volume-title":"Document Analysis and Recognition \u2013 ICDAR 2021","author":"S Klaiman","year":"2021","unstructured":"Klaiman, S., Lehne, M.: DocReader: bounding-box free training of a document information extraction\u00a0model. In: Llad\u00f3s, J., Lopresti, D., Uchida, S. (eds.) ICDAR 2021. LNCS, vol. 12821, pp. 451\u2013465. Springer, Cham (2021). https:\/\/doi.org\/10.1007\/978-3-030-86549-8_29"},{"key":"29_CR30","doi-asserted-by":"publisher","unstructured":"Lewis, D., Agam, G., Argamon, S., Frieder, O., Grossman, D., Heard, J.: Building a test collection for complex document information processing. In: Proceedings of the 29th Annual International ACM SIGIR Conference on Research and Development in Information Retrieval, SIGIR 2006, pp. 665\u2013666. Association for Computing Machinery, New York (2006). https:\/\/doi.org\/10.1145\/1148170.1148307","DOI":"10.1145\/1148170.1148307"},{"key":"29_CR31","doi-asserted-by":"publisher","unstructured":"Lewis, M., et al.: BART: denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension. In: Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, pp. 7871\u20137880. Association for Computational Linguistics, July 2020. https:\/\/doi.org\/10.18653\/v1\/2020.acl-main.703. https:\/\/aclanthology.org\/2020.acl-main.703","DOI":"10.18653\/v1\/2020.acl-main.703"},{"key":"29_CR32","doi-asserted-by":"publisher","unstructured":"Li, C., et al.: StructuralLM: structural pre-training for form understanding. In: Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers), pp. 6309\u20136318. Association for Computational Linguistics, August 2021. https:\/\/doi.org\/10.18653\/v1\/2021.acl-long.493. https:\/\/aclanthology.org\/2021.acl-long.493","DOI":"10.18653\/v1\/2021.acl-long.493"},{"key":"29_CR33","doi-asserted-by":"publisher","unstructured":"Li, P., et al.: SelfDoc: self-supervised document representation learning. In: 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 5648\u20135656 (2021). https:\/\/doi.org\/10.1109\/CVPR46437.2021.00560","DOI":"10.1109\/CVPR46437.2021.00560"},{"key":"29_CR34","doi-asserted-by":"publisher","unstructured":"Liao, M., Shi, B., Bai, X., Wang, X., Liu, W.: Textboxes: a fast text detector with a single deep neural network. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 31, no. 1, February 2017. https:\/\/doi.org\/10.1609\/aaai.v31i1.11196. https:\/\/ojs.aaai.org\/index.php\/AAAI\/article\/view\/11196","DOI":"10.1609\/aaai.v31i1.11196"},{"key":"29_CR35","doi-asserted-by":"publisher","unstructured":"Liu, W., Chen, C., Wong, K.Y.K., Su, Z., Han, J.: Star-net: a spatial attention residue network for scene text recognition. In: Richard C. Wilson, E.R.H., Smith, W.A.P. (eds.) Proceedings of the British Machine Vision Conference (BMVC), pp. 43.1\u201343.13. BMVA Press, September 2016. https:\/\/doi.org\/10.5244\/C.30.43","DOI":"10.5244\/C.30.43"},{"key":"29_CR36","doi-asserted-by":"crossref","unstructured":"Liu, Y., et al.: Multilingual denoising pre-training for neural machine translation. Trans. Assoc. Comput. Linguist. 8, 726\u2013742 (2020). https:\/\/aclanthology.org\/2020.tacl-1.47","DOI":"10.1162\/tacl_a_00343"},{"key":"29_CR37","doi-asserted-by":"crossref","unstructured":"Liu, Y., Chen, H., Shen, C., He, T., Jin, L., Wang, L.: ABCNet: real-time scene text spotting with adaptive Bezier-curve network. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), June 2020","DOI":"10.1109\/CVPR42600.2020.00983"},{"key":"29_CR38","doi-asserted-by":"crossref","unstructured":"Liu, Z., et al.: Swin transformer: hierarchical vision transformer using shifted windows. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 10012\u201310022, October 2021","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"29_CR39","unstructured":"Long, S., Yao, C.: Unrealtext: synthesizing realistic scene text images from the unreal world. arXiv preprint arXiv:2003.10608 (2020)"},{"key":"29_CR40","doi-asserted-by":"publisher","unstructured":"Majumder, B.P., Potti, N., Tata, S., Wendt, J.B., Zhao, Q., Najork, M.: Representation learning for information extraction from form-like documents. In: Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, pp. 6495\u20136504. Association for Computational Linguistics, July 2020. https:\/\/doi.org\/10.18653\/v1\/2020.acl-main.580. https:\/\/www.aclweb.org\/anthology\/2020.acl-main.580","DOI":"10.18653\/v1\/2020.acl-main.580"},{"key":"29_CR41","doi-asserted-by":"publisher","unstructured":"Majumder, B.P., Potti, N., Tata, S., Wendt, J.B., Zhao, Q., Najork, M.: Representation learning for information extraction from form-like documents. In: Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, pp. 6495\u20136504. Association for Computational Linguistics, July 2020. https:\/\/doi.org\/10.18653\/v1\/2020.acl-main.580. https:\/\/aclanthology.org\/2020.acl-main.580","DOI":"10.18653\/v1\/2020.acl-main.580"},{"key":"29_CR42","doi-asserted-by":"crossref","unstructured":"Mathew, M., Karatzas, D., Jawahar, C.: DocVQA: a dataset for VQA on document images. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 2200\u20132209 (2021)","DOI":"10.1109\/WACV48630.2021.00225"},{"key":"29_CR43","unstructured":"Park, S., et al.: Cord: a consolidated receipt dataset for post-OCR parsing. In: Workshop on Document Intelligence at NeurIPS 2019 (2019)"},{"key":"29_CR44","unstructured":"Peng, D., et al.: SPTS: Single-Point Text Spotting. CoRR abs\/2112.07917 (2021). https:\/\/arxiv.org\/abs\/2112.07917"},{"key":"29_CR45","doi-asserted-by":"crossref","unstructured":"Phan, T.Q., Shivakumara, P., Tian, S., Tan, C.L.: Recognizing text with perspective distortion in natural scenes. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV), December 2013","DOI":"10.1109\/ICCV.2013.76"},{"key":"29_CR46","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"732","DOI":"10.1007\/978-3-030-86331-9_47","volume-title":"Document Analysis and Recognition \u2013 ICDAR 2021","author":"R Powalski","year":"2021","unstructured":"Powalski, R., Borchmann, \u0141, Jurkiewicz, D., Dwojak, T., Pietruszka, M., Pa\u0142ka, G.: Going full-TILT boogie on document understanding with text-image-layout transformer. In: Llad\u00f3s, J., Lopresti, D., Uchida, S. (eds.) ICDAR 2021. LNCS, vol. 12822, pp. 732\u2013747. Springer, Cham (2021). https:\/\/doi.org\/10.1007\/978-3-030-86331-9_47"},{"key":"29_CR47","doi-asserted-by":"publisher","unstructured":"Riba, P., Dutta, A., Goldmann, L., Forn\u00e9s, A., Ramos, O., Llad\u00f3, J.: Table detection in invoice documents by graph neural networks. In: 2019 International Conference on Document Analysis and Recognition (ICDAR), pp. 122\u2013127 (2019). https:\/\/doi.org\/10.1109\/ICDAR.2019.00028","DOI":"10.1109\/ICDAR.2019.00028"},{"key":"29_CR48","doi-asserted-by":"publisher","unstructured":"Rijhwani, S., Anastasopoulos, A., Neubig, G.: OCR post correction for endangered language texts. In: Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP), pp. 5931\u20135942. Association for Computational Linguistics, November 2020. https:\/\/doi.org\/10.18653\/v1\/2020.emnlp-main.478. https:\/\/aclanthology.org\/2020.emnlp-main.478","DOI":"10.18653\/v1\/2020.emnlp-main.478"},{"key":"29_CR49","unstructured":"Schaefer, R., Neudecker, C.: A two-step approach for automatic OCR post-correction. In: Proceedings of the The 4th Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature, pp. 52\u201357. International Committee on Computational Linguistics, December 2020. https:\/\/aclanthology.org\/2020.latechclfl-1.6"},{"key":"29_CR50","doi-asserted-by":"publisher","first-page":"2298","DOI":"10.1109\/TPAMI.2016.2646371","volume":"39","author":"B Shi","year":"2017","unstructured":"Shi, B., Bai, X., Yao, C.: An end-to-end trainable neural network for image-based sequence recognition and its application to scene text recognition. IEEE Trans. Pattern Anal. Mach. Intell. 39, 2298\u20132304 (2017)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"29_CR51","doi-asserted-by":"publisher","unstructured":"Shi, B., Wang, X., Lyu, P., Yao, C., Bai, X.: Robust scene text recognition with automatic rectification. In: 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 4168\u20134176 (2016). https:\/\/doi.org\/10.1109\/CVPR.2016.452","DOI":"10.1109\/CVPR.2016.452"},{"key":"29_CR52","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"348","DOI":"10.1007\/11669487_31","volume-title":"Document Analysis Systems VII","author":"K Taghva","year":"2006","unstructured":"Taghva, K., Beckley, R., Coombs, J.: The effects of OCR error on the extraction of private information. In: Bunke, H., Spitz, A.L. (eds.) DAS 2006. LNCS, vol. 3872, pp. 348\u2013357. Springer, Heidelberg (2006). https:\/\/doi.org\/10.1007\/11669487_31"},{"key":"29_CR53","unstructured":"Tan, M., Le, Q.: Efficientnetv2: smaller models and faster training. In: Meila, M., Zhang, T. (eds.) Proceedings of the 38th International Conference on Machine Learning. Proceedings of Machine Learning Research, 18\u201324 July 2021, vol. 139, pp. 10096\u201310106. PMLR (2021). https:\/\/proceedings.mlr.press\/v139\/tan21a.html"},{"key":"29_CR54","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"56","DOI":"10.1007\/978-3-319-46484-8_4","volume-title":"Computer Vision \u2013 ECCV 2016","author":"Z Tian","year":"2016","unstructured":"Tian, Z., Huang, W., He, T., He, P., Qiao, Y.: Detecting text in natural image with connectionist text proposal network. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9912, pp. 56\u201372. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46484-8_4"},{"key":"29_CR55","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"635","DOI":"10.1007\/978-3-030-86337-1_42","volume-title":"Document Analysis and Recognition \u2013 ICDAR 2021","author":"R Tito","year":"2021","unstructured":"Tito, R., Mathew, M., Jawahar, C.V., Valveny, E., Karatzas, D.: ICDAR 2021 competition on document visual question answering. In: Llad\u00f3s, J., Lopresti, D., Uchida, S. (eds.) ICDAR 2021. LNCS, vol. 12824, pp. 635\u2013649. Springer, Cham (2021). https:\/\/doi.org\/10.1007\/978-3-030-86337-1_42"},{"key":"29_CR56","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Guyon, I., et al. (eds.) Advances in Neural Information Processing Systems, vol. 30. Curran Associates, Inc. (2017). https:\/\/proceedings.neurips.cc\/paper\/2017\/file\/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf"},{"key":"29_CR57","unstructured":"Wang, J., Hu, X.: Gated recurrent convolution neural network for OCR. In: Guyon, I., et al. (eds.) Advances in Neural Information Processing Systems, vol. 30. Curran Associates, Inc. (2017). https:\/\/proceedings.neurips.cc\/paper\/2017\/file\/c24cd76e1ce41366a4bbe8a49b02a028-Paper.pdf"},{"key":"29_CR58","unstructured":"Wang, S., Li, B., Khabsa, M., Fang, H., Ma, H.: Linformer: self-attention with linear complexity. arXiv preprint arXiv:2006.04768 (2020)"},{"issue":"2","key":"29_CR59","doi-asserted-by":"publisher","first-page":"270","DOI":"10.1162\/neco.1989.1.2.270","volume":"1","author":"RJ Williams","year":"1989","unstructured":"Williams, R.J., Zipser, D.: A learning algorithm for continually running fully recurrent neural networks. Neural Comput. 1(2), 270\u2013280 (1989)","journal-title":"Neural Comput."},{"key":"29_CR60","doi-asserted-by":"publisher","unstructured":"Xu, Y., et al.: LayoutLMv2: multi-modal pre-training for visually-rich document understanding. In: Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers), pp. 2579\u20132591. Association for Computational Linguistics, August 2021. https:\/\/doi.org\/10.18653\/v1\/2021.acl-long.201. https:\/\/aclanthology.org\/2021.acl-long.201","DOI":"10.18653\/v1\/2021.acl-long.201"},{"key":"29_CR61","doi-asserted-by":"publisher","unstructured":"Xu, Y., Li, M., Cui, L., Huang, S., Wei, F., Zhou, M.: Layoutlm: pre-training of text and layout for document image understanding. In: Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining, KDD 2020, pp. 1192\u20131200. Association for Computing Machinery, New York (2020). https:\/\/doi.org\/10.1145\/3394486.3403172","DOI":"10.1145\/3394486.3403172"},{"key":"29_CR62","unstructured":"Xu, Y., et al.: Layoutxlm: multimodal pre-training for multilingual visually-rich document understanding. arXiv preprint arXiv:2104.08836 (2021)"},{"key":"29_CR63","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"109","DOI":"10.1007\/978-3-030-86337-1_8","volume-title":"Document Analysis and Recognition \u2013 ICDAR 2021","author":"M Yim","year":"2021","unstructured":"Yim, M., Kim, Y., Cho, H.-C., Park, S.: SynthTIGER: synthetic text image GEneratoR towards better text recognition models. In: Llad\u00f3s, J., Lopresti, D., Uchida, S. (eds.) ICDAR 2021. LNCS, vol. 12824, pp. 109\u2013124. Springer, Cham (2021). https:\/\/doi.org\/10.1007\/978-3-030-86337-1_8"},{"key":"29_CR64","doi-asserted-by":"publisher","unstructured":"Zhang, K., Shasha, D.: Simple fast algorithms for the editing distance between trees and related problems. SIAM J. Comput. 18, 1245\u20131262 (1989). https:\/\/doi.org\/10.1137\/0218082","DOI":"10.1137\/0218082"},{"key":"29_CR65","doi-asserted-by":"publisher","unstructured":"Zhang, Z., Zhang, C., Shen, W., Yao, C., Liu, W., Bai, X.: Multi-oriented text detection with fully convolutional networks. In: 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 4159\u20134167 (2016). https:\/\/doi.org\/10.1109\/CVPR.2016.451","DOI":"10.1109\/CVPR.2016.451"},{"key":"29_CR66","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"564","DOI":"10.1007\/978-3-030-58589-1_34","volume-title":"Computer Vision \u2013 ECCV 2020","author":"X Zhong","year":"2020","unstructured":"Zhong, X., ShafieiBavani, E., Jimeno Yepes, A.: Image-based table recognition: data, model, and evaluation. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12366, pp. 564\u2013580. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58589-1_34"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2022"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-19815-1_29","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,10,21]],"date-time":"2022-10-21T23:27:58Z","timestamp":1666394878000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-19815-1_29"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022]]},"ISBN":["9783031198144","9783031198151"],"references-count":66,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-19815-1_29","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2022]]},"assertion":[{"value":"20 October 2022","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Tel Aviv","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Israel","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2022","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 October 2022","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27 October 2022","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2022","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2022.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"CMT","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"5804","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1645","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"28% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.21","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.91","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}