{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,4,23]],"date-time":"2025-04-23T16:09:37Z","timestamp":1745424577542,"version":"3.37.3"},"reference-count":43,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2024,6,1]],"date-time":"2024-06-01T00:00:00Z","timestamp":1717200000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2024,6,1]],"date-time":"2024-06-01T00:00:00Z","timestamp":1717200000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2024,6,1]],"date-time":"2024-06-01T00:00:00Z","timestamp":1717200000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2024,6,1]],"date-time":"2024-06-01T00:00:00Z","timestamp":1717200000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2024,6,1]],"date-time":"2024-06-01T00:00:00Z","timestamp":1717200000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,6,1]],"date-time":"2024-06-01T00:00:00Z","timestamp":1717200000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62225605","U20A20222"],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100013317","name":"Shanxi Provincial Key Research and Development Project","doi-asserted-by":"publisher","award":["2023C03196"],"id":[{"id":"10.13039\/501100013317","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100018807","name":"Ng Teng Fong Charitable Foundation","doi-asserted-by":"publisher","award":["188170-11102"],"id":[{"id":"10.13039\/501100018807","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100022963","name":"Key Research and Development Program of Zhejiang Province","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100022963","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Pattern Recognition"],"published-print":{"date-parts":[[2024,6]]},"DOI":"10.1016\/j.patcog.2024.110314","type":"journal-article","created":{"date-parts":[[2024,2,2]],"date-time":"2024-02-02T17:11:08Z","timestamp":1706893868000},"page":"110314","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":2,"special_numbering":"C","title":["Reading order detection in visually-rich documents with multi-modal layout-aware relation prediction"],"prefix":"10.1016","volume":"150","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-4464-4644","authenticated-orcid":false,"given":"Liang","family":"Qiao","sequence":"first","affiliation":[]},{"given":"Can","family":"Li","sequence":"additional","affiliation":[]},{"given":"Zhanzhan","family":"Cheng","sequence":"additional","affiliation":[]},{"given":"Yunlu","family":"Xu","sequence":"additional","affiliation":[]},{"given":"Yi","family":"Niu","sequence":"additional","affiliation":[]},{"given":"Xi","family":"Li","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.patcog.2024.110314_b1","first-page":"85","article-title":"An end-to-end OCR text re-organization sequence learning for rich-text detail image comprehension","volume":"vol. 12370","author":"Li","year":"2020"},{"key":"10.1016\/j.patcog.2024.110314_b2","unstructured":"J. Devlin, M. Chang, K. Lee, K. Toutanova, BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding, in: NAACL-HLT, 2019, pp. 4171\u20134186."},{"key":"10.1016\/j.patcog.2024.110314_b3","doi-asserted-by":"crossref","unstructured":"Y. Xu, Y. Xu, T. Lv, L. Cui, F. Wei, G. Wang, Y. Lu, D.A.F. Flor\u00eancio, C. Zhang, W. Che, M. Zhang, L. Zhou, LayoutLMv2: Multi-modal Pre-training for Visually-rich Document Understanding, in: ACL\/IJCNLP, 2021, pp. 2579\u20132591.","DOI":"10.18653\/v1\/2021.acl-long.201"},{"key":"10.1016\/j.patcog.2024.110314_b4","doi-asserted-by":"crossref","unstructured":"P. Zhang, Y. Xu, Z. Cheng, S. Pu, J. Lu, L. Qiao, Y. Niu, F. Wu, TRIE: End-to-End Text Reading and Information Extraction for Document Understanding, in: ACM MM, 2020, pp. 1413\u20131422.","DOI":"10.1145\/3394171.3413900"},{"key":"10.1016\/j.patcog.2024.110314_b5","doi-asserted-by":"crossref","unstructured":"Z. Gu, C. Meng, K. Wang, J. Lan, W. Wang, M. Gu, L. Zhang, XYLayoutLM: Towards Layout-Aware Multimodal Networks For Visually-Rich Document Understanding, in: CVPR, 2022, pp. 4573\u20134582.","DOI":"10.1109\/CVPR52688.2022.00454"},{"key":"10.1016\/j.patcog.2024.110314_b6","doi-asserted-by":"crossref","unstructured":"M. Mathew, D. Karatzas, C.V. Jawahar, DocVQA: A Dataset for VQA on Document Images, in: WACV, 2021, pp. 2199\u20132208.","DOI":"10.1109\/WACV48630.2021.00225"},{"key":"10.1016\/j.patcog.2024.110314_b7","doi-asserted-by":"crossref","unstructured":"C. Fang, J. Li, L. Li, C. Ma, D. Hu, Separate and Locate: Rethink the Text in Text-based Visual Question Answering, in: ACM MM, 2023, pp. 4378\u20134388.","DOI":"10.1145\/3581783.3611753"},{"key":"10.1016\/j.patcog.2024.110314_b8","doi-asserted-by":"crossref","unstructured":"Y. Xu, M. Li, L. Cui, S. Huang, F. Wei, M. Zhou, LayoutLM: Pre-training of Text and Layout for Document Image Understanding, in: KDD, 2020, pp. 1192\u20131200.","DOI":"10.1145\/3394486.3403172"},{"issue":"2","key":"10.1016\/j.patcog.2024.110314_b9","doi-asserted-by":"crossref","first-page":"2769","DOI":"10.3233\/JIFS-220705","article-title":"MFRCNN: Marshalled FRCNN with optimized reading order in XY tree for document layout analysis in scientific research articles","volume":"44","author":"Rose","year":"2023","journal-title":"J. Intell. Fuzzy Systems"},{"key":"10.1016\/j.patcog.2024.110314_b10","doi-asserted-by":"crossref","unstructured":"C. Zhang, Y. Guo, Y. Tu, H. Chen, J. Tang, H. Zhu, Q. Zhang, T. Gui, Reading Order Matters: Information Extraction from Visually-rich Documents by Token Path Prediction, in: EMNLP, 2023, pp. 13716\u201313730.","DOI":"10.18653\/v1\/2023.emnlp-main.846"},{"key":"10.1016\/j.patcog.2024.110314_b11","doi-asserted-by":"crossref","unstructured":"S.B.R. Chowdhury, F. Brahman, S. Chaturvedi, Is Everything in Order? A Simple Way to Order Sentences, in: EMNLP, 2021, pp. 10769\u201310779.","DOI":"10.18653\/v1\/2021.emnlp-main.841"},{"key":"10.1016\/j.patcog.2024.110314_b12","unstructured":"C. Guinaudeau, M. Strube, Graph-based Local Coherence Modeling, in: ACL, 2013, pp. 93\u2013103."},{"key":"10.1016\/j.patcog.2024.110314_b13","first-page":"3","article-title":"Text reading order in uncontrolled conditions by sparse graph segmentation","volume":"vol. 14192","author":"Wang","year":"2023"},{"key":"10.1016\/j.patcog.2024.110314_b14","doi-asserted-by":"crossref","unstructured":"Z. Wang, Y. Xu, L. Cui, J. Shang, F. Wei, LayoutReader: Pre-training of Text and Layout for Reading Order Detection, in: EMNLP, 2021, pp. 4735\u20134744.","DOI":"10.18653\/v1\/2021.emnlp-main.389"},{"issue":"1","key":"10.1016\/j.patcog.2024.110314_b15","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1162\/coli.2008.34.1.1","article-title":"Modeling local coherence: An entity-based approach","volume":"34","author":"Barzilay","year":"2008","journal-title":"Comput. Linguist."},{"key":"10.1016\/j.patcog.2024.110314_b16","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2023.109337","article-title":"Beyond OCR + VQA: Towards end-to-end reading and reasoning for robust and accurate textvqa","volume":"138","author":"Zeng","year":"2023","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2024.110314_b17","doi-asserted-by":"crossref","unstructured":"H. Agrawal, A. Chandrasekaran, D. Batra, D. Parikh, M. Bansal, Sort Story: Sorting Jumbled Images and Captions into Stories, in: EMNLP, 2016, pp. 925\u2013931.","DOI":"10.18653\/v1\/D16-1091"},{"key":"10.1016\/j.patcog.2024.110314_b18","doi-asserted-by":"crossref","unstructured":"P. Kumar, D. Brahma, H. Karnick, P. Rai, Deep Attentive Ranking Networks for Learning to Order Sentences, in: AAAI, 2020, pp. 8115\u20138122.","DOI":"10.1609\/aaai.v34i05.6323"},{"key":"10.1016\/j.patcog.2024.110314_b19","doi-asserted-by":"crossref","unstructured":"S. Prabhumoye, R. Salakhutdinov, A.W. Black, Topological Sort for Sentence Ordering, in: ACL, 2020, pp. 2783\u20132792.","DOI":"10.18653\/v1\/2020.acl-main.248"},{"key":"10.1016\/j.patcog.2024.110314_b20","doi-asserted-by":"crossref","unstructured":"L. Logeswaran, H. Lee, D.R. Radev, Sentence Ordering and Coherence Modeling using Recurrent Neural Networks, in: AAAI, 2018, pp. 5285\u20135292.","DOI":"10.1609\/aaai.v32i1.11997"},{"key":"10.1016\/j.patcog.2024.110314_b21","doi-asserted-by":"crossref","unstructured":"H.C. Moon, T. Mohiuddin, S.R. Joty, X. Chi, A Unified Neural Coherence Model, in: EMNLP-IJCNLP, 2019, pp. 2262\u20132272.","DOI":"10.18653\/v1\/D19-1231"},{"key":"10.1016\/j.patcog.2024.110314_b22","doi-asserted-by":"crossref","unstructured":"Y. Yin, L. Song, J. Su, J. Zeng, C. Zhou, J. Luo, Graph-based Neural Sentence Ordering, in: IJCAI, 2019, pp. 5387\u20135393.","DOI":"10.24963\/ijcai.2019\/748"},{"key":"10.1016\/j.patcog.2024.110314_b23","doi-asserted-by":"crossref","unstructured":"M. Lewis, Y. Liu, N. Goyal, M. Ghazvininejad, A. Mohamed, O. Levy, V. Stoyanov, L. Zettlemoyer, BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension, in: ACL, 2020, pp. 7871\u20137880.","DOI":"10.18653\/v1\/2020.acl-main.703"},{"year":"2003","series-title":"Bidimensional Relations for Reading Order Detection","author":"Marco Aiello","key":"10.1016\/j.patcog.2024.110314_b24"},{"key":"10.1016\/j.patcog.2024.110314_b25","doi-asserted-by":"crossref","unstructured":"M. Ceci, M. Berardi, G. Porcelli, D. Malerba, A Data Mining Approach to Reading Order Detection, in: ICDAR, 2007, pp. 924\u2013928.","DOI":"10.1109\/ICDAR.2007.4377050"},{"key":"10.1016\/j.patcog.2024.110314_b26","doi-asserted-by":"crossref","unstructured":"S. Ferilli, D. Grieco, D. Redavid, F. Esposito, Abstract argumentation for reading order detection, in: ACM Symposium on Document Engineering, 2014, pp. 45\u201348.","DOI":"10.1145\/2644866.2644883"},{"issue":"10","key":"10.1016\/j.patcog.2024.110314_b27","doi-asserted-by":"crossref","first-page":"3200","DOI":"10.1016\/j.patcog.2008.03.014","article-title":"A machine-learning approach for analyzing document layout structures with two reading orders","volume":"41","author":"Wu","year":"2008","journal-title":"Pattern Recognit."},{"issue":"12","key":"10.1016\/j.patcog.2024.110314_b28","doi-asserted-by":"crossref","first-page":"9593","DOI":"10.1007\/s00521-022-06948-5","article-title":"Reading order detection on handwritten documents","volume":"34","author":"Quir\u00f3s","year":"2022","journal-title":"Neural Comput. Appl."},{"key":"10.1016\/j.patcog.2024.110314_b29","doi-asserted-by":"crossref","unstructured":"Y. Huang, T. Lv, L. Cui, Y. Lu, F. Wei, LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking, in: ACMMM, 2022, pp. 4083\u20134091.","DOI":"10.1145\/3503161.3548112"},{"key":"10.1016\/j.patcog.2024.110314_b30","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2023.109419","article-title":"VLCDoC: Vision-Language contrastive pre-training model for cross-Modal document classification","volume":"139","author":"Bakkali","year":"2023","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2024.110314_b31","doi-asserted-by":"crossref","unstructured":"K. He, X. Zhang, S. Ren, J. Sun, Deep Residual Learning for Image Recognition, in: CVPR, 2016, pp. 770\u2013778.","DOI":"10.1109\/CVPR.2016.90"},{"key":"10.1016\/j.patcog.2024.110314_b32","unstructured":"T. Mikolov, I. Sutskever, K. Chen, G.S. Corrado, J. Dean, Distributed Representations of Words and Phrases and their Compositionality, in: NeurIPS, 2013, pp. 3111\u20133119."},{"key":"10.1016\/j.patcog.2024.110314_b33","unstructured":"A. Vaswani, N. Shazeer, N. Parmar, J. Uszkoreit, L. Jones, A.N. Gomez, L. Kaiser, I. Polosukhin, Attention is All you Need, in: NeurIPS, 2017, pp. 5998\u20136008."},{"key":"10.1016\/j.patcog.2024.110314_b34","doi-asserted-by":"crossref","unstructured":"P. Yin, G. Neubig, W. Yih, S. Riedel, TaBERT: Pretraining for Joint Understanding of Textual and Tabular Data, in: ACL, 2020, pp. 8413\u20138426.","DOI":"10.18653\/v1\/2020.acl-main.745"},{"year":"2021","series-title":"RoFormer: Enhanced transformer with rotary position embedding","author":"Su","key":"10.1016\/j.patcog.2024.110314_b35"},{"year":"2022","series-title":"TRIE++: towards end-to-end information extraction from visually rich documents","author":"Cheng","key":"10.1016\/j.patcog.2024.110314_b36"},{"key":"10.1016\/j.patcog.2024.110314_b37","unstructured":"S. Ren, K. He, R.B. Girshick, J. Sun, Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks, in: NeurIPS, 2015, pp. 91\u201399."},{"key":"10.1016\/j.patcog.2024.110314_b38","doi-asserted-by":"crossref","unstructured":"K. He, G. Gkioxari, P. Doll\u00e1r, R.B. Girshick, Mask R-CNN, in: ICCV, 2017, pp. 2980\u20132988.","DOI":"10.1109\/ICCV.2017.322"},{"issue":"2","key":"10.1016\/j.patcog.2024.110314_b39","doi-asserted-by":"crossref","first-page":"260","DOI":"10.1109\/TIT.1967.1054010","article-title":"Error bounds for convolutional codes and an asymptotically optimum decoding algorithm","volume":"13","author":"Viterbi","year":"1967","journal-title":"IEEE Trans. Inf. Theory"},{"issue":"08","key":"10.1016\/j.patcog.2024.110314_b40","doi-asserted-by":"crossref","first-page":"2298","DOI":"10.11834\/jig.220911","article-title":"SCID: a Chinese characters invoice-scanned dataset in relevant to key information extraction derived of visually-rich document images","volume":"28","author":"Qiao","year":"2023","journal-title":"J. Image Graph."},{"key":"10.1016\/j.patcog.2024.110314_b41","unstructured":"I. Loshchilov, F. Hutter, Decoupled Weight Decay Regularization, in: ICLR, 2019."},{"key":"10.1016\/j.patcog.2024.110314_b42","doi-asserted-by":"crossref","unstructured":"K. Papineni, S. Roukos, T. Ward, W. Zhu, Bleu: a Method for Automatic Evaluation of Machine Translation, in: ACL, 2002, pp. 311\u2013318.","DOI":"10.3115\/1073083.1073135"},{"key":"10.1016\/j.patcog.2024.110314_b43","series-title":"ICCV","first-page":"973","article-title":"DocFormer: End-to-end transformer for document understanding","author":"Appalaraju","year":"2021"}],"container-title":["Pattern Recognition"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0031320324000657?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0031320324000657?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2024,11,9]],"date-time":"2024-11-09T22:58:00Z","timestamp":1731193080000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0031320324000657"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,6]]},"references-count":43,"alternative-id":["S0031320324000657"],"URL":"https:\/\/doi.org\/10.1016\/j.patcog.2024.110314","relation":{},"ISSN":["0031-3203"],"issn-type":[{"type":"print","value":"0031-3203"}],"subject":[],"published":{"date-parts":[[2024,6]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Reading order detection in visually-rich documents with multi-modal layout-aware relation prediction","name":"articletitle","label":"Article Title"},{"value":"Pattern Recognition","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.patcog.2024.110314","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2024 Elsevier Ltd. All rights reserved.","name":"copyright","label":"Copyright"}],"article-number":"110314"}}