{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,12,4]],"date-time":"2024-12-04T02:40:39Z","timestamp":1733280039023,"version":"3.30.1"},"publisher-location":"Cham","reference-count":34,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031781186","type":"print"},{"value":"9783031781193","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,12,5]],"date-time":"2024-12-05T00:00:00Z","timestamp":1733356800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,5]],"date-time":"2024-12-05T00:00:00Z","timestamp":1733356800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,5]],"date-time":"2024-12-05T00:00:00Z","timestamp":1733356800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,5]],"date-time":"2024-12-05T00:00:00Z","timestamp":1733356800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-78119-3_6","type":"book-chapter","created":{"date-parts":[[2024,12,4]],"date-time":"2024-12-04T02:00:51Z","timestamp":1733277651000},"page":"76-90","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["ROISER: Towards Real World Semantic Entity Recognition from\u00a0Visually-Rich Documents"],"prefix":"10.1007","author":[{"given":"Zening","family":"Lin","sequence":"first","affiliation":[]},{"given":"Jiapeng","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Wenhui","family":"Liao","sequence":"additional","affiliation":[]},{"given":"Weicong","family":"Dai","sequence":"additional","affiliation":[]},{"given":"Longfei","family":"Xiong","sequence":"additional","affiliation":[]},{"given":"Lianwen","family":"Jin","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,12,5]]},"reference":[{"key":"6_CR1","doi-asserted-by":"crossref","unstructured":"Appalaraju, S., Jasani, B., Kota, B.U., Xie, Y., Manmatha, R.: DocFormer: end-to-end transformer for document understanding. In: 2021 IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 973\u2013983. IEEE (2021)","DOI":"10.1109\/ICCV48922.2021.00103"},{"key":"6_CR2","doi-asserted-by":"crossref","unstructured":"Cao, H., et al.: Query-driven generative network for document information extraction in the wild. In: Proceedings of the 30th ACM International Conference on Multimedia, pp. 4261\u20134271 (2022)","DOI":"10.1145\/3503161.3547877"},{"key":"6_CR3","doi-asserted-by":"crossref","unstructured":"Chi, Z., et al.: InfoXLM: an information-theoretic framework for cross-lingual language model pre-training. In: Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, pp. 3576\u20133588 (2021)","DOI":"10.18653\/v1\/2021.naacl-main.280"},{"key":"6_CR4","doi-asserted-by":"publisher","first-page":"280","DOI":"10.1007\/978-3-031-25069-9_19","volume-title":"Computer Vision \u2013 ECCV 2022 Workshops: Tel Aviv, Israel, October 23\u201327, 2022, Proceedings, Part IV","author":"B Davis","year":"2023","unstructured":"Davis, B., Morse, B., Price, B., Tensmeyer, C., Wigington, C., Morariu, V.: End-to-end document recognition and\u00a0understanding with\u00a0dessurt. In: Karlinsky, L., Michaeli, T., Nishino, K. (eds.) Computer Vision \u2013 ECCV 2022 Workshops: Tel Aviv, Israel, October 23\u201327, 2022, Proceedings, Part IV, pp. 280\u2013296. Springer Nature Switzerland, Cham (2023). https:\/\/doi.org\/10.1007\/978-3-031-25069-9_19"},{"key":"6_CR5","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. In: Proceedings of NAACL-HLT, pp. 4171\u20134186 (2019)"},{"key":"6_CR6","doi-asserted-by":"crossref","unstructured":"Gu, Z., et al.: XYLayoutLM: towards layout-aware multimodal networks for visually-rich document understanding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4583\u20134592 (2022)","DOI":"10.1109\/CVPR52688.2022.00454"},{"key":"6_CR7","doi-asserted-by":"crossref","unstructured":"Jaume, G., Ekenel, H.K., Thiran, J.P.: FUNSD: a dataset for form understanding in noisy scanned documents. In: ICDAR-OST (2019)","DOI":"10.1109\/ICDARW.2019.10029"},{"key":"6_CR8","doi-asserted-by":"crossref","unstructured":"Guo, H., Qin, X., Liu, J., Han, J., Liu, J., Ding, E.: EATEN: entity-aware attention for single shot visual text extraction. In: 2019 International Conference on Document Analysis and Recognition (ICDAR), pp. 254\u2013259. IEEE (2019)","DOI":"10.1109\/ICDAR.2019.00049"},{"key":"6_CR9","doi-asserted-by":"crossref","unstructured":"Ha, J., Haralick, R.M., Phillips, I.T.: Recursive X-Y cut using bounding boxes of connected components. In: Proceedings of 3rd International Conference on Document Analysis and Recognition, vol.\u00a02, pp. 952\u2013955. IEEE (1995)","DOI":"10.1109\/ICDAR.1995.602059"},{"key":"6_CR10","doi-asserted-by":"crossref","unstructured":"Hong, T., Kim, D., Ji, M., Hwang, W., Nam, D., Park, S.: BROS: a pre-trained language model focusing on text and layout for better key information extraction from documents. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 36(10), pp. 10767\u201310775 (2022)","DOI":"10.1609\/aaai.v36i10.21322"},{"issue":"11","key":"6_CR11","first-page":"12899","volume":"37","author":"K Hu","year":"2023","unstructured":"Hu, K., Wu, Z., Zhong, Z., Lin, W., Sun, L., Huo, Q.: A question-answering approach to key value pair extraction from form-like document images. Proc. AAAI Conf. Artif. Intell. 37(11), 12899\u201312906 (2023)","journal-title":"Proc. AAAI Conf. Artif. Intell."},{"key":"6_CR12","doi-asserted-by":"crossref","unstructured":"Huang, Y., Lv, T., Cui, L., Lu, Y., Wei, F.: LayoutLMv3: pre-training for document AI with unified text and image masking. In: Proceedings of the 30th ACM International Conference on Multimedia, pp. 4083\u20134091 (2022)","DOI":"10.1145\/3503161.3548112"},{"key":"6_CR13","doi-asserted-by":"crossref","unstructured":"Huang, Z., Chen, K., He, J., Bai, X., Karatzas, D., Lu, S., Jawahar, C.: ICDAR2019 competition on scanned receipt OCR and information extraction. In: 2019 International Conference on Document Analysis and Recognition (ICDAR), pp. 1516\u20131520. IEEE (2019)","DOI":"10.1109\/ICDAR.2019.00244"},{"key":"6_CR14","doi-asserted-by":"publisher","first-page":"498","DOI":"10.1007\/978-3-031-19815-1_29","volume-title":"Computer Vision \u2013 ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23\u201327, 2022, Proceedings, Part XXVIII","author":"G Kim","year":"2022","unstructured":"Kim, G., et al.: OCR-free document understanding transformer. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) Computer Vision \u2013 ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23\u201327, 2022, Proceedings, Part XXVIII, pp. 498\u2013517. Springer Nature Switzerland, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19815-1_29"},{"key":"6_CR15","doi-asserted-by":"crossref","unstructured":"Li, C., et al.: StructuralLM: structural pre-training for form understanding. In: Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers), pp. 6309\u20136318 (2021)","DOI":"10.18653\/v1\/2021.acl-long.493"},{"key":"6_CR16","unstructured":"Li, C., et\u00a0al.: PP-OCRv3: more attempts for the improvement of ultra lightweight OCR system. arXiv preprint arXiv:2206.03001 (2022)"},{"key":"6_CR17","doi-asserted-by":"crossref","unstructured":"Li, Y., et al.: StrucTexT: structured text understanding with multi-modal transformers. In: Proceedings of the 29th ACM International Conference on Multimedia, pp. 1912\u20131920 (2021)","DOI":"10.1145\/3474085.3475345"},{"key":"6_CR18","doi-asserted-by":"crossref","unstructured":"Liao, H., et al.: DocTr: document transformer for structured information extraction in documents. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 19584\u201319594 (2023)","DOI":"10.1109\/ICCV51070.2023.01794"},{"key":"6_CR19","doi-asserted-by":"crossref","unstructured":"Lin, Z., et al.: PEneo: unifying line extraction, line grouping, and entity linking for end-to-end document pair extraction. arXiv preprint arXiv:2401.03472 (2024)","DOI":"10.1145\/3664647.3680931"},{"key":"6_CR20","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. In: Proceedings of the 7th International Conference on Learning Representations (ICLR) (2019)"},{"key":"6_CR21","doi-asserted-by":"crossref","unstructured":"Luo, C., Cheng, C., Zheng, Q., Yao, C.: GeoLayoutLM: geometric pre-training for visual information extraction. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7092\u20137101 (2023)","DOI":"10.1109\/CVPR52729.2023.00685"},{"key":"6_CR22","unstructured":"Park, S., et al.: CORD: a consolidated receipt dataset for post-OCR parsing. In: Workshop on Document Intelligence at NeurIPS 2019 (2019)"},{"key":"6_CR23","doi-asserted-by":"crossref","unstructured":"Peng, Q., et\u00a0al.: ERNIE-Layout: Layout knowledge enhanced pre-training for visually-rich document understanding. In: Findings of the Association for Computational Linguistics: EMNLP 2022, pp. 3744\u20133756 (2022)","DOI":"10.18653\/v1\/2022.findings-emnlp.274"},{"key":"6_CR24","doi-asserted-by":"publisher","first-page":"157","DOI":"10.1007\/978-94-017-2390-9_10","volume-title":"Natural Language Processing Using Very Large Corpora","author":"LA Ramshaw","year":"1999","unstructured":"Ramshaw, L.A., Marcus, M.P.: Text chunking using transformation-based learning. In: Armstrong, S., Church, K., Isabelle, P., Manzi, S., Tzoukermann, E., Yarowsky, D. (eds.) Natural Language Processing Using Very Large Corpora, pp. 157\u2013176. Springer Netherlands, Dordrecht (1999). https:\/\/doi.org\/10.1007\/978-94-017-2390-9_10"},{"key":"6_CR25","doi-asserted-by":"crossref","unstructured":"Shrivastava, A., Gupta, A., Girshick, R.: Training region-based object detectors with online hard example mining. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. pp. 761\u2013769 (2016)","DOI":"10.1109\/CVPR.2016.89"},{"key":"6_CR26","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems, vol. 30 (2017)"},{"key":"6_CR27","doi-asserted-by":"crossref","unstructured":"Wang, J., Jin, L., Ding, K.: LiLT: a simple yet effective language-independent layout transformer for structured document understanding. In: Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 7747\u20137757 (2022)","DOI":"10.18653\/v1\/2022.acl-long.534"},{"key":"6_CR28","doi-asserted-by":"crossref","unstructured":"Wang, Z., Xu, Y., Cui, L., Shang, J., Wei, F.: LayoutReader: pre-training of text and layout for reading order detection. In: Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing, pp. 4735\u20134744 (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.389"},{"key":"6_CR29","doi-asserted-by":"crossref","unstructured":"Xu, Y., et\u00a0al.: LayoutLMv2: multi-modal pre-training for visually-rich document understanding. In: Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers), pp. 2579\u20132591 (2021)","DOI":"10.18653\/v1\/2021.acl-long.201"},{"key":"6_CR30","doi-asserted-by":"crossref","unstructured":"Xu, Y., Li, M., Cui, L., Huang, S., Wei, F., Zhou, M.: LayoutLM: pre-training of text and layout for document image understanding. In: Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining, pp. 1192\u20131200 (2020)","DOI":"10.1145\/3394486.3403172"},{"key":"6_CR31","unstructured":"Xu, Y., et al.: LayoutXLM: multimodal pre-training for multilingual visually-rich document understanding (2021)"},{"key":"6_CR32","doi-asserted-by":"crossref","unstructured":"Xu, Y., et al.: XFUND: a benchmark dataset for multilingual visually rich form understanding. In: Findings of the Association for Computational Linguistics: ACL 2022, pp. 3214\u20133224 (2022)","DOI":"10.18653\/v1\/2022.findings-acl.253"},{"key":"6_CR33","doi-asserted-by":"crossref","unstructured":"Yang, Z., et al.: Modeling entities as semantic points for visual information extraction in the wild. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15358\u201315367 (2023)","DOI":"10.1109\/CVPR52729.2023.01474"},{"key":"6_CR34","doi-asserted-by":"crossref","unstructured":"Zhang, C., et al.: Reading order matters: information extraction from visually-rich documents by token path prediction. In: Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing, pp. 13716\u201313730 (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.846"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-78119-3_6","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,4]],"date-time":"2024-12-04T02:03:32Z","timestamp":1733277812000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-78119-3_6"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,5]]},"ISBN":["9783031781186","9783031781193"],"references-count":34,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-78119-3_6","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,12,5]]},"assertion":[{"value":"5 December 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICPR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Pattern Recognition","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Kolkata","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"India","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"1 December 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"5 December 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icpr2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/icpr2024.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}