{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T22:36:40Z","timestamp":1730327800649,"version":"3.28.0"},"publisher-location":"New York, NY, USA","reference-count":26,"publisher":"ACM","funder":[{"name":"AI4EUROPE","award":["101070000"]},{"name":"ECSEL Joint Undertaking (JU)","award":["101007350"]},{"name":"HORIZON EUROPE","award":["952215"]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,6,9]]},"DOI":"10.1145\/3650203.3663326","type":"proceedings-article","created":{"date-parts":[[2024,5,29]],"date-time":"2024-05-29T20:13:23Z","timestamp":1717013603000},"page":"1-6","update-policy":"http:\/\/dx.doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Croissant: A Metadata Format for ML-Ready Datasets"],"prefix":"10.1145","author":[{"ORCID":"http:\/\/orcid.org\/0009-0003-6346-2392","authenticated-orcid":false,"given":"Mubashara","family":"Akhtar","sequence":"first","affiliation":[{"name":"King's College London"}]},{"ORCID":"http:\/\/orcid.org\/0000-0002-4173-7709","authenticated-orcid":false,"given":"Omar","family":"Benjelloun","sequence":"additional","affiliation":[{"name":"Google"}]},{"ORCID":"http:\/\/orcid.org\/0000-0002-9484-9632","authenticated-orcid":false,"given":"Costanza","family":"Conforti","sequence":"additional","affiliation":[{"name":"Google"}]},{"ORCID":"http:\/\/orcid.org\/0000-0001-7346-8075","authenticated-orcid":false,"given":"Pieter","family":"Gijsbers","sequence":"additional","affiliation":[{"name":"TUE & OpenML"}]},{"ORCID":"http:\/\/orcid.org\/0000-0003-2335-6977","authenticated-orcid":false,"given":"Joan","family":"Giner-Miguelez","sequence":"additional","affiliation":[{"name":"Universitat Oberta de Catalunya"}]},{"ORCID":"http:\/\/orcid.org\/0000-0002-7429-7949","authenticated-orcid":false,"given":"Nitisha","family":"Jain","sequence":"additional","affiliation":[{"name":"King's College London"}]},{"ORCID":"http:\/\/orcid.org\/0000-0002-0805-1828","authenticated-orcid":false,"given":"Michael","family":"Kuchnik","sequence":"additional","affiliation":[{"name":"Meta"}]},{"ORCID":"http:\/\/orcid.org\/0009-0000-7660-3420","authenticated-orcid":false,"given":"Quentin","family":"Lhoest","sequence":"additional","affiliation":[{"name":"Hugging Face"}]},{"ORCID":"http:\/\/orcid.org\/0009-0000-7193-1185","authenticated-orcid":false,"given":"Pierre","family":"Marcenac","sequence":"additional","affiliation":[{"name":"Google"}]},{"ORCID":"http:\/\/orcid.org\/0000-0002-5087-6903","authenticated-orcid":false,"given":"Manil","family":"Maskey","sequence":"additional","affiliation":[{"name":"NASA"}]},{"ORCID":"http:\/\/orcid.org\/0000-0002-5984-238X","authenticated-orcid":false,"given":"Peter","family":"Mattson","sequence":"additional","affiliation":[{"name":"Google"}]},{"ORCID":"http:\/\/orcid.org\/0000-0002-1379-8627","authenticated-orcid":false,"given":"Luis","family":"Oala","sequence":"additional","affiliation":[{"name":"Dotphoton"}]},{"ORCID":"http:\/\/orcid.org\/0009-0006-8506-6464","authenticated-orcid":false,"given":"Pierre","family":"Ruyssen","sequence":"additional","affiliation":[{"name":"Google"}]},{"ORCID":"http:\/\/orcid.org\/0000-0002-9505-6204","authenticated-orcid":false,"given":"Rajat","family":"Shinde","sequence":"additional","affiliation":[{"name":"NASA IMPACT & UAH"}]},{"ORCID":"http:\/\/orcid.org\/0000-0003-1722-947X","authenticated-orcid":false,"given":"Elena","family":"Simperl","sequence":"additional","affiliation":[{"name":"King's College London and Open Data Institute"}]},{"ORCID":"http:\/\/orcid.org\/0009-0002-5654-9644","authenticated-orcid":false,"given":"Goeffry","family":"Thomas","sequence":"additional","affiliation":[{"name":"Google and Kaggle"}]},{"ORCID":"http:\/\/orcid.org\/0000-0001-9447-9830","authenticated-orcid":false,"given":"Slava","family":"Tykhonov","sequence":"additional","affiliation":[{"name":"DANS-KNAW"}]},{"ORCID":"http:\/\/orcid.org\/0000-0001-7044-9805","authenticated-orcid":false,"given":"Joaquin","family":"Vanschoren","sequence":"additional","affiliation":[{"name":"TUE & OpenML"}]},{"ORCID":"http:\/\/orcid.org\/0000-0003-0430-1532","authenticated-orcid":false,"given":"Jos","family":"van der Velde","sequence":"additional","affiliation":[{"name":"TUE & OpenML"}]},{"ORCID":"http:\/\/orcid.org\/0000-0002-7046-0606","authenticated-orcid":false,"given":"Steffen","family":"Vogler","sequence":"additional","affiliation":[{"name":"Bayer"}]},{"ORCID":"http:\/\/orcid.org\/0000-0002-9032-7239","authenticated-orcid":false,"given":"Carole-Jean","family":"Wu","sequence":"additional","affiliation":[{"name":"Meta"}]}],"member":"320","published-online":{"date-parts":[[2024,6,9]]},"reference":[{"key":"e_1_3_2_1_1_1","first-page":"33","article-title":"Diagnosing and removing performance bottlenecks in machine learning data pipelines","volume":"4","author":"Kuchnik Michael","year":"2022","unstructured":"Michael Kuchnik, Ana Klimovic, Jiri Simsa, Virginia Smith, and George Amvrosiadis. Plumber: Diagnosing and removing performance bottlenecks in machine learning data pipelines. Proceedings of Machine Learning and Systems, 4: 33--51, 2022.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_2_1","volume-title":"Journal of Data-centric Machine Learning Research","author":"Oala Luis","year":"2024","unstructured":"Luis Oala, Manil Maskey, Lilith Bat-Leah, Alicia Parrish, Nezihe Merve G\u00fcrel, Tzu-Sheng Kuo, Yang Liu, Rotem Dror, Danilo Brajovic, Xiaozhe Yao, Max Bartolo, William A Gaviria Rojas, Ryan Hileman, Rainier Aliment, Michael W. Mahoney, Meg Risdal, Matthew Lease, Wojciech Samek, Debojyoti Dutta, Curtis G Northcutt, Cody Coleman, Braden Hancock, Bernard Koch, Girmaw Abebe Tadesse, Bojan Karla\u0161, Ahmed Alaa, Adji Bousso Dieng, Natasha Noy, Vijay Janapa Reddi, James Zou, Praveen Paritosh, Mihaela van der Schaar, Kurt Bollacker, Lora Aroyo, Ce Zhang, Joaquin Vanschoren, Isabelle Guyon, and Peter Mattson. DMLR: Data-centric machine learning research - past, present and future. Journal of Data-centric Machine Learning Research, 2024. URL https:\/\/openreview.net\/forum?id=2kpu78QdeE. Featured Certification, Survey Certification."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3411764.3445518"},{"key":"e_1_3_2_1_4_1","volume-title":"Croissant format specification. Technical report","author":"Benjelloun Omar","year":"2024","unstructured":"Omar Benjelloun, Elena Simperl, Pierre Marcenac, Pierre Ruyssen, Costanza Conforti, Michael Kuchnik, Jos van der Velde, Luis Oala, Steffen Vogler, Mubashara Akthar, Nitisha Jain, and Slava Tykhonov. Croissant format specification. Technical report, 2024. URL https:\/\/mlcommons.org\/croissant\/1.0."},{"key":"e_1_3_2_1_5_1","volume-title":"Fashion-mnist: a novel image dataset for benchmarking machine learning algorithms","author":"Xiao Han","year":"2017","unstructured":"Han Xiao, Kashif Rasul, and Roland Vollgraf. Fashion-mnist: a novel image dataset for benchmarking machine learning algorithms. 2017."},{"key":"e_1_3_2_1_6_1","volume-title":"Hanna Wallach, Hal Daum\u00e9 III, and Kate Crawford. Datasheets for datasets","author":"Gebru Timnit","year":"2021","unstructured":"Timnit Gebru, Jamie Morgenstern, Briana Vecchione, Jennifer Wortman Vaughan, Hanna Wallach, Hal Daum\u00e9 III, and Kate Crawford. Datasheets for datasets, 2021."},{"key":"e_1_3_2_1_7_1","volume-title":"Alejandra Gonzalez Beltran, Andrea Perego, and Peter Winstanley. Data catalog vocabulary (DCAT) - version 3. https:\/\/www.w3.org\/TR\/vocab-dcat-3\/, 01","author":"Albertoni Riccardo","year":"2024","unstructured":"Riccardo Albertoni, David Browning, Simon JD Cox, Alejandra Gonzalez Beltran, Andrea Perego, and Peter Winstanley. Data catalog vocabulary (DCAT) - version 3. https:\/\/www.w3.org\/TR\/vocab-dcat-3\/, 01 2024. (Accessed on 03\/18\/2024)."},{"volume-title":"https:\/\/github.com\/schemaorg\/schemaorg\/tree\/main\/data\/releases\/26.0\/, 02","year":"2024","key":"e_1_3_2_1_8_1","unstructured":"schema.org. Schema.org v26.0. https:\/\/github.com\/schemaorg\/schemaorg\/tree\/main\/data\/releases\/26.0\/, 02 2024. (Accessed on 03\/18\/2024)."},{"key":"e_1_3_2_1_9_1","unstructured":"Data packages. https:\/\/specs.frictionlessdata.io\/. (Accessed on 03\/21\/2024)."},{"key":"e_1_3_2_1_10_1","unstructured":"Csv on the web: A primer. https:\/\/www.w3.org\/TR\/tabular-data-primer\/. (Accessed on 03\/21\/2024)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"crossref","unstructured":"Stian Soiland-Reyes Merc\u00e8 Crosas Peter Sefton Leyla Jael Castro Frederik Coppens Jos\u00e9 M. Fern\u00e1ndez Daniel Garijo Marco La Rosa Bj\u00f6rn Gr\u00fcning Simone Leo Eoghan \u00d3 Carrag\u00e1in Marc Portier Ana Trisovic RO-Crate Community Paul Groth and Carole Goble. Packaging research artefacts with ro-crate. Data Science 5(2) 2022.","DOI":"10.3233\/DS-210053"},{"key":"e_1_3_2_1_12_1","unstructured":"Open archives initiative object exchange and reuse. https:\/\/www.openarchives.org\/ore\/. (Accessed on 03\/21\/2024)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1177\/0049124107306660"},{"key":"e_1_3_2_1_14_1","unstructured":"Ckan. https:\/\/ckan.org\/. (Accessed on 03\/21\/2024)."},{"key":"e_1_3_2_1_15_1","volume-title":"Arrow columnar format --- apache arrow v15.0.1. https:\/\/arrow.apache.org\/docs\/format\/Columnar.html, 01","author":"Foundation Apache Software","year":"2024","unstructured":"Apache Software Foundation. Arrow columnar format --- apache arrow v15.0.1. https:\/\/arrow.apache.org\/docs\/format\/Columnar.html, 01 2024. (Accessed on 03\/16\/2024)."},{"key":"e_1_3_2_1_16_1","volume-title":"Apache parquet. https:\/\/parquet.apache.org\/docs\/file-format\/, 11","author":"Foundation Apache Software","year":"2023","unstructured":"Apache Software Foundation. Apache parquet. https:\/\/parquet.apache.org\/docs\/file-format\/, 11 2023. (Accessed on 03\/16\/2024)."},{"volume-title":"huggingface\/safetensors: Simple, safe way to store and distribute tensors v0.4.2. https:\/\/github.com\/huggingface\/safetensors, 01","year":"2024","key":"e_1_3_2_1_17_1","unstructured":"Huggingface. huggingface\/safetensors: Simple, safe way to store and distribute tensors v0.4.2. https:\/\/github.com\/huggingface\/safetensors, 01 2024. (Accessed on 03\/18\/2024)."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.14778\/3415478.3415560"},{"key":"e_1_3_2_1_19_1","volume-title":"Benchmarking random access in lance. https:\/\/blog.lancedb.com\/announcing-lancedb-5cb0deaa46ee-2\/, 03","author":"She Chang","year":"2023","unstructured":"Chang She. Benchmarking random access in lance. https:\/\/blog.lancedb.com\/announcing-lancedb-5cb0deaa46ee-2\/, 03 2023. (Accessed on 03\/18\/2024)."},{"key":"e_1_3_2_1_20_1","unstructured":"Ibis project. https:\/\/ibis-project.org\/. (Accessed on 03\/21\/2024)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00041"},{"key":"e_1_3_2_1_22_1","volume-title":"Data cards: Purposeful and transparent dataset documentation for responsible ai","author":"Pushkarna Mahima","year":"2022","unstructured":"Mahima Pushkarna, Andrew Zaldivar, and Oddur Kjartansson. Data cards: Purposeful and transparent dataset documentation for responsible ai, 2022."},{"key":"e_1_3_2_1_23_1","volume-title":"Croissant rai specification. Technical report","author":"Akhtar Mubashara","year":"2024","unstructured":"Mubashara Akhtar, Nitisha Jain, Joan Giner-Miguelez, Omar Benjelloun, Elena Simperl, Lora Aroyo, Rajat Shinde, Luis Oala, and Michael Kuchnik. Croissant rai specification. Technical report, 2024. URL https:\/\/mlcommons.org\/croissant\/RAI\/1.0."},{"key":"e_1_3_2_1_24_1","volume-title":"Microsoft coco: Common objects in context","author":"Lin Tsung-Yi","year":"2015","unstructured":"Tsung-Yi Lin, Michael Maire, Serge Belongie, Lubomir Bourdev, Ross Girshick, James Hays, Pietro Perona, Deva Ramanan, C. Lawrence Zitnick, and Piotr Doll\u00e1r. Microsoft coco: Common objects in context, 2015."},{"key":"e_1_3_2_1_25_1","volume-title":"a collection of ready-to-use datasets. https:\/\/www.tensorflow.org\/datasets, 03","author":"TensorFlow Datasets TFDS.","year":"2024","unstructured":"TFDS. TensorFlow Datasets, a collection of ready-to-use datasets. https:\/\/www.tensorflow.org\/datasets, 03 2024."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3308558.3313685"}],"event":{"name":"SIGMOD\/PODS '24: International Conference on Management of Data","sponsor":["SIGMOD ACM Special Interest Group on Management of Data"],"location":"Santiago AA Chile","acronym":"SIGMOD\/PODS '24"},"container-title":["Proceedings of the Eighth Workshop on Data Management for End-to-End Machine Learning"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3650203.3663326","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,7,10]],"date-time":"2024-07-10T10:39:42Z","timestamp":1720607982000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3650203.3663326"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,6,9]]},"references-count":26,"alternative-id":["10.1145\/3650203.3663326","10.1145\/3650203"],"URL":"https:\/\/doi.org\/10.1145\/3650203.3663326","relation":{},"subject":[],"published":{"date-parts":[[2024,6,9]]},"assertion":[{"value":"2024-06-09","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}